regex/TRE/lib/tre-match-utils.h

   1 /*
   2   tre-match-utils.h - TRE matcher helper definitions
   3
   4   This software is released under a BSD-style license.
   5   See the file LICENSE for details and copyright.
   6
   7 */
   8
   9 #include "tre-internal.h"
  10
  11 #define str_source ((const tre_str_source*)string)
  12
  13 #ifdef TRE_WCHAR
  14
  15 #ifdef TRE_MULTIBYTE
  16
  17 /* Wide character and multibyte support. */
  18
  19 #ifdef TRE_STR_USER
  20 #error TRE_STR_USER defined
  21 #define GET_NEXT_WCHAR()                                                      \
  22   do {                                                                        \
  23     prev_c = next_c;                                                          \
  24     if (type == STR_BYTE)                                                     \
  25       {                                                                       \
  26         pos++;                                                                \
  27         if (len >= 0 && pos >= len)                                           \
  28           next_c = '\0';                                                      \
  29         else                                                                  \
  30           next_c = (unsigned char)(*str_byte++);                              \
  31       }                                                                       \
  32     else if (type == STR_WIDE)                                                \
  33       {                                                                       \
  34         pos++;                                                                \
  35         if (len >= 0 && pos >= len)                                           \
  36           next_c = L'\0';                                                     \
  37         else                                                                  \
  38           next_c = *str_wide++;                                               \
  39       }                                                                       \
  40     else if (type == STR_MBS)                                                 \
  41       {                                                                       \
  42         pos += pos_add_next;                                                  \
  43         if (str_byte == NULL)                                                 \
  44           next_c = L'\0';                                                     \
  45         else                                                                  \
  46           {                                                                   \
  47             size_t w;                                                         \
  48             int max;                                                          \
  49             if (len >= 0)                                                     \
  50               max = len - pos;                                                \
  51             else                                                              \
  52               max = 32;                                                       \
  53             if (max <= 0)                                                     \
  54               {                                                               \
  55                 next_c = L'\0';                                               \
  56                 pos_add_next = 1;                                             \
  57               }                                                               \
  58             else                                                              \
  59               {                                                               \
  60                 w = tre_mbrtowc_l(&next_c, str_byte, (size_t)max, &mbstate,   \
  61                                   tnfa->loc);                                 \
  62                 if (w == (size_t)-1 || w == (size_t)-2)                       \
  63                   return REG_ILLSEQ;                                          \
  64                 if (w == 0 && len >= 0)                                       \
  65                   {                                                           \
  66                     pos_add_next = 1;                                         \
  67                     next_c = 0;                                               \
  68                     str_byte++;                                               \
  69                   }                                                           \
  70                 else                                                          \
  71                   {                                                           \
  72                     pos_add_next = w;                                         \
  73                     str_byte += w;                                            \
  74                   }                                                           \
  75               }                                                               \
  76           }                                                                   \
  77       }                                                                       \
  78     else if (type == STR_USER)                                                \
  79       {                                                                       \
  80         pos += pos_add_next;                                                  \
  81         str_user_end = str_source->get_next_char(&next_c, &pos_add_next,      \
  82                                                  str_source->context);        \
  83       }                                                                       \
  84   } while(/*CONSTCOND*/0)
  85 #else /* !TRE_STR_USER */
  86 /*
  87  * Because all multibyte encodings are exclusively single-shift encoding,
  88  * with the shift codes having the high bit set, we can make an optimization
  89  * for STR_MBS that only calls tre_mbrtowc_l() when a high-bit character
  90  * is detected, and just do a direct copy for ASCII characters.
  91  */
  92 #define GET_NEXT_WCHAR()                                                      \
  93   do {                                                                        \
  94     prev_c = next_c;                                                          \
  95     switch (type)                                                             \
  96       {                                                                       \
  97       case STR_BYTE:                                                          \
  98         pos++;                                                                \
  99         if (len >= 0 && pos >= len)                                           \
 100           next_c = '\0';                                                      \
 101         else                                                                  \
 102           next_c = (unsigned char)(*str_byte++);                              \
 103         break;                                                                \
 104       case STR_WIDE:                                                          \
 105         pos++;                                                                \
 106         if (len >= 0 && pos >= len)                                           \
 107           next_c = L'\0';                                                     \
 108         else                                                                  \
 109           next_c = *str_wide++;                                               \
 110         break;                                                                \
 111       case STR_MBS:                                                           \
 112         pos += pos_add_next;                                                  \
 113         if (__builtin_expect(len >= 0 && pos >= len, 0))                      \
 114           {                                                                   \
 115             next_c = L'\0';                                                   \
 116             pos_add_next = 1;                                                 \
 117           }                                                                   \
 118         else if (__builtin_expect(!(*str_byte & 0x80), 1))                    \
 119           {                                                                   \
 120             next_c = (unsigned char)(*str_byte++);                            \
 121             pos_add_next = 1;                                                 \
 122           }                                                                   \
 123         else                                                                  \
 124           {                                                                   \
 125             size_t w;                                                         \
 126             int max;                                                          \
 127             if (len >= 0)                                                     \
 128               max = len - pos;                                                \
 129             else                                                              \
 130               max = 32;                                                       \
 131             w = tre_mbrtowc_l(&next_c, str_byte, (size_t)max, &mbstate,       \
 132                               tnfa->loc);                                     \
 133             if (w == (size_t)-1 || w == (size_t)-2)                           \
 134               return REG_ILLSEQ;                                              \
 135             if (w == 0 && len >= 0)                                           \
 136               {                                                               \
 137                 pos_add_next = 1;                                             \
 138                 next_c = 0;                                                   \
 139                 str_byte++;                                                   \
 140               }                                                               \
 141             else                                                              \
 142               {                                                               \
 143                 pos_add_next = w;                                             \
 144                 str_byte += w;                                                \
 145               }                                                               \
 146           }                                                                   \
 147         break;                                                                \
 148       }                                                                       \
 149   } while(/*CONSTCOND*/0)
 150 #endif /* !TRE_STR_USER */
 151
 152 #else /* !TRE_MULTIBYTE */
 153
 154 /* Wide character support, no multibyte support. */
 155 #error TRE_MULTIBYTE undefined
 156
 157 #ifdef TRE_STR_USER
 158 #define GET_NEXT_WCHAR()                                                      \
 159   do {                                                                        \
 160     prev_c = next_c;                                                          \
 161     if (type == STR_BYTE)                                                     \
 162       {                                                                       \
 163         pos++;                                                                \
 164         if (len >= 0 && pos >= len)                                           \
 165           next_c = '\0';                                                      \
 166         else                                                                  \
 167           next_c = (unsigned char)(*str_byte++);                              \
 168       }                                                                       \
 169     else if (type == STR_WIDE)                                                \
 170       {                                                                       \
 171         pos++;                                                                \
 172         if (len >= 0 && pos >= len)                                           \
 173           next_c = L'\0';                                                     \
 174         else                                                                  \
 175           next_c = *str_wide++;                                               \
 176       }                                                                       \
 177     else if (type == STR_USER)                                                \
 178       {                                                                       \
 179         pos += pos_add_next;                                                  \
 180         str_user_end = str_source->get_next_char(&next_c, &pos_add_next,      \
 181                                                  str_source->context);        \
 182       }                                                                       \
 183   } while(/*CONSTCOND*/0)
 184 #else /* !TRE_STR_USER */
 185 #define GET_NEXT_WCHAR()                                                      \
 186   do {                                                                        \
 187     prev_c = next_c;                                                          \
 188     if (type == STR_BYTE)                                                     \
 189       {                                                                       \
 190         pos++;                                                                \
 191         if (len >= 0 && pos >= len)                                           \
 192           next_c = '\0';                                                      \
 193         else                                                                  \
 194           next_c = (unsigned char)(*str_byte++);                              \
 195       }                                                                       \
 196     else if (type == STR_WIDE)                                                \
 197       {                                                                       \
 198         pos++;                                                                \
 199         if (len >= 0 && pos >= len)                                           \
 200           next_c = L'\0';                                                     \
 201         else                                                                  \
 202           next_c = *str_wide++;                                               \
 203       }                                                                       \
 204   } while(/*CONSTCOND*/0)
 205 #endif /* !TRE_STR_USER */
 206
 207 #endif /* !TRE_MULTIBYTE */
 208
 209 #else /* !TRE_WCHAR */
 210
 211 /* No wide character or multibyte support. */
 212 #error TRE_WCHAR undefined
 213
 214 #ifdef TRE_STR_USER
 215 #define GET_NEXT_WCHAR()                                                      \
 216   do {                                                                        \
 217     prev_c = next_c;                                                          \
 218     if (type == STR_BYTE)                                                     \
 219       {                                                                       \
 220         pos++;                                                                \
 221         if (len >= 0 && pos >= len)                                           \
 222           next_c = '\0';                                                      \
 223         else                                                                  \
 224           next_c = (unsigned char)(*str_byte++);                              \
 225       }                                                                       \
 226     else if (type == STR_USER)                                                \
 227       {                                                                       \
 228         pos += pos_add_next;                                                  \
 229         str_user_end = str_source->get_next_char(&next_c, &pos_add_next,      \
 230                                                  str_source->context);        \
 231       }                                                                       \
 232   } while(/*CONSTCOND*/0)
 233 #else /* !TRE_STR_USER */
 234 #define GET_NEXT_WCHAR()                                                      \
 235   do {                                                                        \
 236     prev_c = next_c;                                                          \
 237     if (type == STR_BYTE)                                                     \
 238       {                                                                       \
 239         pos++;                                                                \
 240         if (len >= 0 && pos >= len)                                           \
 241           next_c = '\0';                                                      \
 242         else                                                                  \
 243           next_c = (unsigned char)(*str_byte++);                              \
 244       }                                                                       \
 245   } while(/*CONSTCOND*/0)
 246 #endif /* !TRE_STR_USER */
 247
 248 #endif /* !TRE_WCHAR */
 249
 250
 251
 252 /* Assumes tre_tnfa_t *tnfa in scope */
 253 #define IS_WORD_CHAR(c)  ((c) == L'_' || tre_isalnum_l(c, tnfa->loc))
 254
 255 #define CHECK_ASSERTIONS(assertions)                                          \
 256   (((assertions & ASSERT_AT_BOL)                                              \
 257     && (pos > 0 || reg_notbol)                                                \
 258     && (prev_c != L'\n' || !reg_newline))                                     \
 259    || ((assertions & ASSERT_AT_EOL)                                           \
 260        && (next_c != L'\0' || reg_noteol)                                     \
 261        && (next_c != L'\n' || !reg_newline))                                  \
 262    || ((assertions & ASSERT_AT_BOW)                                           \
 263        && (IS_WORD_CHAR(prev_c) || !IS_WORD_CHAR(next_c)))                    \
 264    || ((assertions & ASSERT_AT_EOW)                                           \
 265        && (!IS_WORD_CHAR(prev_c) || IS_WORD_CHAR(next_c)))                    \
 266    || ((assertions & ASSERT_AT_WB)                                            \
 267        && (pos != 0 && next_c != L'\0'                                        \
 268            && IS_WORD_CHAR(prev_c) == IS_WORD_CHAR(next_c)))                  \
 269    || ((assertions & ASSERT_AT_WB_NEG)                                        \
 270        && (pos == 0 || next_c == L'\0'                                        \
 271            || IS_WORD_CHAR(prev_c) != IS_WORD_CHAR(next_c))))
 272
 273 #define CHECK_CHAR_CLASSES(trans_i, tnfa, eflags)                             \
 274   ((trans_i->assertions & ASSERT_BRACKET_MATCH)                               \
 275     && !tre_bracket_match(trans_i->u.bracket_match_list,(tre_cint_t)prev_c,    \
 276                                       tnfa))
 277
 278
 279
 280
 281 inline static int
 282 tre_tag_get(const tre_tag_t *tags, int i)
 283 {
 284   tags += i;
 285   return tags->count > 0 ? tags->value : -1;
 286 }
 287
 288 inline static void
 289 tre_tag_set(tre_tag_t *tags, int i, int val, int touch)
 290 {
 291   tags += i;
 292   if (tags->count++ == 0)
 293     tags->first = val;
 294   tags->value = val;
 295   tags->touch = touch;
 296 }
 297
 298 inline static void
 299 tre_tag_reset(tre_tag_t *tags, int i)
 300 {
 301   tags[i].count = 0;
 302 }
 303
 304 inline static int
 305 tre_tag_touch_get(const tre_tag_t *tags, int i)
 306 {
 307   return tags[i].touch;
 308 }
 309
 310 #ifdef TRE_DEBUG
 311 inline static void
 312 tre_print_tags(const tre_tag_t *tags, int num_tags)
 313 {
 314   int i;
 315   for (i = 0; i < num_tags; i++, tags++)
 316     {
 317       switch(tags->count)
 318         {
 319         case 0:
 320           DPRINT(("%d:(0,-1)", i));
 321           break;
 322         case 1:
 323           DPRINT(("%d:(1,%d)", i, tags->first));
 324           break;
 325         default:
 326           DPRINT(("%d:(%d,%d,%d)", i, tags->count, tags->first,
 327                   tags->value));
 328           break;
 329         }
 330       if (i < (num_tags - 1))
 331         DPRINT((" "));
 332     }
 333 }
 334
 335 inline static void
 336 tre_print_tags_all(const tre_tag_t *tags, int num_tags)
 337 {
 338   int i;
 339   for (i = 0; i < num_tags; i++, tags++)
 340     {
 341       switch(tags->count)
 342         {
 343         case 0:
 344           DPRINT(("%d:(0,-1)/%d", i, tags->touch));
 345           break;
 346         case 1:
 347           DPRINT(("%d:(1,%d)/%d", i, tags->first, tags->touch));
 348           break;
 349         default:
 350           DPRINT(("%d:(%d,%d,%d)/%d", i, tags->count, tags->first,
 351                   tags->value, tags->touch));
 352           break;
 353         }
 354       if (i < (num_tags - 1))
 355         DPRINT((" "));
 356     }
 357 }
 358 #endif /* TRE_DEBUG */
 359
 360 /* Return < 0, = 0 or > 0 depending on how the start/end pairs of a minimal
 361  * group between t1 and t2 compare (t1 loses if < 0, t1 wins if > 0) */
 362 inline static int
 363 tre_minimal_tag_order(int start, int end, const tre_tag_t *tags1,
 364                       const tre_tag_t *tags2)
 365 {
 366   const tre_tag_t *t1, *t2;
 367
 368   t1 = tags1 + start;
 369   t2 = tags2 + start;
 370   /* We need both start tags to be set */
 371   if (t1->count == 0 || t2->count == 0)
 372     return 0;
 373
 374   /* The start tags must be equal */
 375   if (t1->value != t2->value)
 376     return 0;
 377
 378   t1 = tags1 + end;
 379   t2 = tags2 + end;
 380   /* For the end tags, we prefer set over unset, because unset means that
 381    * the end tag is still growing */
 382   if (t1->count == 0)
 383     {
 384       /* if t2 is set, t1 loses since it is unset */
 385       if (t2->count != 0)
 386         return -1;
 387     }
 388   /* if t2 not set, t1 wins since it is set */
 389   else if (t2->count == 0)
 390     return 1;
 391
 392   /* least current value wins */
 393   return t2->value - t1->value;
 394 }
 395
 396 /* Return < 0, = 0 or > 0 depending on how the i-th item of t1 and t2 compare
 397  * (t1 loses if < 0, t1 wins if > 0) */
 398 inline static int
 399 tre_tag_order_1(int i, tre_tag_direction_t dir, const tre_tag_t *t1,
 400                 const tre_tag_t *t2)
 401 {
 402   int diff;
 403
 404   t1 += i;
 405   t2 += i;
 406   switch (dir)
 407     {
 408     case TRE_TAG_MINIMIZE:
 409       /* least current value wins (because tags are initialized to all zeros,
 410        * unset wins over set; also, tre_minimal_tag_order() will have already
 411        * been run, which checks for being unset) */
 412       return t2->value - t1->value;
 413
 414     case TRE_TAG_MAXIMIZE:
 415       /* prefer set */
 416       if (t1->count == 0)
 417         {
 418           /* if neither t1 and t2 are set, try next tag */
 419           if (t2->count == 0)
 420             return 0;
 421           /* t2 is set, t1 loses since it is unset */
 422           return -1;
 423         }
 424       /* if t2 not set, t1 wins since it is set */
 425       else if (t2->count == 0)
 426         return 1;
 427       /* greatest initial value wins */
 428       if ((diff = t1->first - t2->first) != 0)
 429         return diff;
 430       /* least number of times the tag was set, wins */
 431       if ((diff = t2->count - t1->count) != 0)
 432         return diff;
 433       /* if the tags were only set once, they only have initial values */
 434       if (t1->count == 1)
 435         return 0;
 436       /* greatest current value wins */
 437       return t1->value - t2->value;
 438
 439     case TRE_TAG_LEFT_MAXIMIZE:
 440       /* prefer set */
 441       if (t1->count == 0)
 442         {
 443           /* if neither t1 and t2 are set, try next tag */
 444           if (t2->count == 0)
 445             return 0;
 446           /* t2 is set, t1 loses since it is unset */
 447           return -1;
 448         }
 449       /* if t2 not set, t1 wins since it is set */
 450       else if (t2->count == 0)
 451         return 1;
 452       /* least initial value wins */
 453       if ((diff = t2->first - t1->first) != 0)
 454         return diff;
 455       /* least number of times the tag was set, wins */
 456       if ((diff = t2->count - t1->count) != 0)
 457         return diff;
 458       /* if the tags were only set once, they only have initial values */
 459       if (t1->count == 1)
 460         return 0;
 461       /* greatest current value wins */
 462       return t1->value - t2->value;
 463
 464     default:
 465       /* Shouldn't happen: only assert if TRE_DEBUG defined */
 466       assert(0);
 467       break;
 468     }
 469   return 0;
 470 }
 471
 472 #ifdef TRE_DEBUG
 473 #define _MORE_DEBUGGING
 474 #endif /* TRE_DEBUG */
 475
 476 /* Returns 1 if `t1' wins `t2', 0 otherwise. */
 477 inline static int
 478 #ifdef _MORE_DEBUGGING
 479 _tre_tag_order(int num_tags, tre_tag_direction_t *tag_directions,
 480               const tre_tag_t *t1, const tre_tag_t *t2)
 481 #else /* !_MORE_DEBUGGING */
 482 tre_tag_order(int num_tags, tre_tag_direction_t *tag_directions,
 483               const tre_tag_t *t1, const tre_tag_t *t2)
 484 #endif /* !_MORE_DEBUGGING */
 485 {
 486   int i, ret;
 487
 488   for (i = 0; i < num_tags; i++)
 489     {
 490       if ((ret = tre_tag_order_1(i, tag_directions[i], t1, t2)) != 0)
 491         return (ret > 0);
 492     }
 493   /*  assert(0);*/
 494   return 0;
 495 }
 496
 497 #ifdef _MORE_DEBUGGING
 498 inline static int
 499 tre_tag_order(int num_tags, tre_tag_direction_t *tag_directions,
 500               const tre_tag_t *t1, const tre_tag_t *t2)
 501 {
 502   int ret = _tre_tag_order(num_tags, tag_directions, t1, t2);
 503   DPRINT(("tre_tag_order: "));
 504   tre_print_tags(t1, num_tags);
 505   DPRINT((" %s ", ret ? "wins" : "doesn't win"));
 506   tre_print_tags(t2, num_tags);
 507   DPRINT(("\n"));
 508   return ret;
 509 }
 510 #endif /* _MORE_DEBUGGING */
 511
 512 #ifdef __LIBC__
 513 #include <xlocale_private.h>
 514 #else /* !__LIBC__ */
 515 #include <xlocale.h>
 516 #endif /* !__LIBC__ */
 517
 518 int __collate_equiv_value(locale_t loc, const wchar_t *str, size_t len);
 519
 520 inline static int
 521 tre_bracket_match(tre_bracket_match_list_t * __restrict list, tre_cint_t wc,
 522                   const tre_tnfa_t * __restrict tnfa)
 523 {
 524   int match = 0;
 525   int i;
 526   tre_bracket_match_t *b;
 527   tre_cint_t uc, lc;
 528   int we, ue, le, got_equiv = 0;
 529   int icase = ((tnfa->cflags & REG_ICASE) != 0);
 530
 531   DPRINT(("tre_bracket_match: %p, %d, %d\n", list, wc, icase));
 532   if (icase)
 533     {
 534       if (tre_islower_l(wc, tnfa->loc))
 535         {
 536           lc = wc;
 537           uc = tre_toupper_l(wc, tnfa->loc);
 538         }
 539       else if (tre_isupper_l(wc, tnfa->loc))
 540         {
 541           uc = wc;
 542           lc = tre_tolower_l(wc, tnfa->loc);
 543         }
 544       else
 545         {
 546           icase = 0;
 547         }
 548     }
 549   for (i = 0, b = list->bracket_matches; i < list->num_bracket_matches;
 550        i++, b++)
 551     {
 552       switch (b->type)
 553         {
 554         case TRE_BRACKET_MATCH_TYPE_CHAR:
 555           if (icase)
 556             match = (b->value == uc || b->value == lc);
 557           else
 558             match = (b->value == wc);
 559           break;
 560         case TRE_BRACKET_MATCH_TYPE_RANGE_BEGIN:
 561           {
 562             tre_cint_t start = b->value, end;
 563             if (++i >= list->num_bracket_matches ||
 564                 (++b)->type != TRE_BRACKET_MATCH_TYPE_RANGE_END)
 565               {
 566                 DPRINT(("tre_bracket_match: no following range end\n"));
 567                 assert(0);
 568                 goto error;
 569               }
 570             end = b->value;
 571             if (!got_equiv)
 572               {
 573                 if (icase)
 574                   {
 575                     ue = __collate_equiv_value(tnfa->loc, &uc, 1);
 576                     le = __collate_equiv_value(tnfa->loc, &lc, 1);
 577                   }
 578                 else
 579                     we = __collate_equiv_value(tnfa->loc, &wc, 1);
 580                 got_equiv = 1;
 581               }
 582             if (icase)
 583               match = ((start <= ue && ue <= end) ||
 584                       (start <= le && le <= end));
 585             else
 586               match = (start <= we && we <= end);
 587             break;
 588           }
 589         case TRE_BRACKET_MATCH_TYPE_RANGE_END:
 590           DPRINT(("tre_bracket_match: range end without preceeding start\n"));
 591           assert(0);
 592           break;
 593         case TRE_BRACKET_MATCH_TYPE_CLASS:
 594           if (icase)
 595             match = (tre_isctype_l(uc, b->value, tnfa->loc) ||
 596                      tre_isctype_l(lc, b->value, tnfa->loc));
 597           else
 598             match = (tre_isctype_l(wc, b->value, tnfa->loc));
 599           break;
 600         case TRE_BRACKET_MATCH_TYPE_EQUIVALENCE:
 601           if (!got_equiv)
 602             {
 603               if (icase)
 604                 {
 605                   ue = __collate_equiv_value(tnfa->loc, &uc, 1);
 606                   le = __collate_equiv_value(tnfa->loc, &lc, 1);
 607                 }
 608               else
 609                   we = __collate_equiv_value(tnfa->loc, &wc, 1);
 610               got_equiv = 1;
 611             }
 612           if (icase)
 613             match = (b->value == ue || b->value == le);
 614           else
 615             match = (b->value == we);
 616           break;
 617         default:
 618           DPRINT(("tre_bracket_match: unknown type %d\n", b->type));
 619           assert(0);
 620           break;
 621         }
 622         if (match)
 623           break;
 624     }
 625 error:
 626   if (list->flags & TRE_BRACKET_MATCH_FLAG_NEGATE) {
 627     if ((tnfa->cflags & REG_NEWLINE) && wc == '\n') return 0;
 628     match = !match;
 629   }
 630   return match;
 631 }