src/src/pcre/pcre_study.c

   1 /* $Cambridge: exim/src/src/pcre/pcre_study.c,v 1.6 2007/11/12 13:02:20 nm4 Exp $ */
   2
   3 /*************************************************
   4 *      Perl-Compatible Regular Expressions       *
   5 *************************************************/
   6
   7 /* PCRE is a library of functions to support regular expressions whose syntax
   8 and semantics are as close as possible to those of the Perl 5 language.
   9
  10                        Written by Philip Hazel
  11            Copyright (c) 1997-2007 University of Cambridge
  12
  13 -----------------------------------------------------------------------------
  14 Redistribution and use in source and binary forms, with or without
  15 modification, are permitted provided that the following conditions are met:
  16
  17     * Redistributions of source code must retain the above copyright notice,
  18       this list of conditions and the following disclaimer.
  19
  20     * Redistributions in binary form must reproduce the above copyright
  21       notice, this list of conditions and the following disclaimer in the
  22       documentation and/or other materials provided with the distribution.
  23
  24     * Neither the name of the University of Cambridge nor the names of its
  25       contributors may be used to endorse or promote products derived from
  26       this software without specific prior written permission.
  27
  28 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  29 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  30 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  31 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  32 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  33 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  34 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  35 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  36 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  37 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  38 POSSIBILITY OF SUCH DAMAGE.
  39 -----------------------------------------------------------------------------
  40 */
  41
  42
  43 /* This module contains the external function pcre_study(), along with local
  44 supporting functions. */
  45
  46
  47 #ifdef HAVE_CONFIG_H
  48 #include "config.h"
  49 #endif
  50
  51 #include "pcre_internal.h"
  52
  53
  54 /* Returns from set_start_bits() */
  55
  56 enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE };
  57
  58
  59 /*************************************************
  60 *      Set a bit and maybe its alternate case    *
  61 *************************************************/
  62
  63 /* Given a character, set its bit in the table, and also the bit for the other
  64 version of a letter if we are caseless.
  65
  66 Arguments:
  67   start_bits    points to the bit map
  68   c             is the character
  69   caseless      the caseless flag
  70   cd            the block with char table pointers
  71
  72 Returns:        nothing
  73 */
  74
  75 static void
  76 set_bit(uschar *start_bits, unsigned int c, BOOL caseless, compile_data *cd)
  77 {
  78 start_bits[c/8] |= (1 << (c&7));
  79 if (caseless && (cd->ctypes[c] & ctype_letter) != 0)
  80   start_bits[cd->fcc[c]/8] |= (1 << (cd->fcc[c]&7));
  81 }
  82
  83
  84
  85 /*************************************************
  86 *          Create bitmap of starting bytes       *
  87 *************************************************/
  88
  89 /* This function scans a compiled unanchored expression recursively and
  90 attempts to build a bitmap of the set of possible starting bytes. As time goes
  91 by, we may be able to get more clever at doing this. The SSB_CONTINUE return is
  92 useful for parenthesized groups in patterns such as (a*)b where the group
  93 provides some optional starting bytes but scanning must continue at the outer
  94 level to find at least one mandatory byte. At the outermost level, this
  95 function fails unless the result is SSB_DONE.
  96
  97 Arguments:
  98   code         points to an expression
  99   start_bits   points to a 32-byte table, initialized to 0
 100   caseless     the current state of the caseless flag
 101   utf8         TRUE if in UTF-8 mode
 102   cd           the block with char table pointers
 103
 104 Returns:       SSB_FAIL     => Failed to find any starting bytes
 105                SSB_DONE     => Found mandatory starting bytes
 106                SSB_CONTINUE => Found optional starting bytes
 107 */
 108
 109 static int
 110 set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless,
 111   BOOL utf8, compile_data *cd)
 112 {
 113 register int c;
 114 int yield = SSB_DONE;
 115
 116 #if 0
 117 /* ========================================================================= */
 118 /* The following comment and code was inserted in January 1999. In May 2006,
 119 when it was observed to cause compiler warnings about unused values, I took it
 120 out again. If anybody is still using OS/2, they will have to put it back
 121 manually. */
 122
 123 /* This next statement and the later reference to dummy are here in order to
 124 trick the optimizer of the IBM C compiler for OS/2 into generating correct
 125 code. Apparently IBM isn't going to fix the problem, and we would rather not
 126 disable optimization (in this module it actually makes a big difference, and
 127 the pcre module can use all the optimization it can get). */
 128
 129 volatile int dummy;
 130 /* ========================================================================= */
 131 #endif
 132
 133 do
 134   {
 135   const uschar *tcode = code + (((int)*code == OP_CBRA)? 3:1) + LINK_SIZE;
 136   BOOL try_next = TRUE;
 137
 138   while (try_next)    /* Loop for items in this branch */
 139     {
 140     int rc;
 141     switch(*tcode)
 142       {
 143       /* Fail if we reach something we don't understand */
 144
 145       default:
 146       return SSB_FAIL;
 147
 148       /* If we hit a bracket or a positive lookahead assertion, recurse to set
 149       bits from within the subpattern. If it can't find anything, we have to
 150       give up. If it finds some mandatory character(s), we are done for this
 151       branch. Otherwise, carry on scanning after the subpattern. */
 152
 153       case OP_BRA:
 154       case OP_SBRA:
 155       case OP_CBRA:
 156       case OP_SCBRA:
 157       case OP_ONCE:
 158       case OP_ASSERT:
 159       rc = set_start_bits(tcode, start_bits, caseless, utf8, cd);
 160       if (rc == SSB_FAIL) return SSB_FAIL;
 161       if (rc == SSB_DONE) try_next = FALSE; else
 162         {
 163         do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
 164         tcode += 1 + LINK_SIZE;
 165         }
 166       break;
 167
 168       /* If we hit ALT or KET, it means we haven't found anything mandatory in
 169       this branch, though we might have found something optional. For ALT, we
 170       continue with the next alternative, but we have to arrange that the final
 171       result from subpattern is SSB_CONTINUE rather than SSB_DONE. For KET,
 172       return SSB_CONTINUE: if this is the top level, that indicates failure,
 173       but after a nested subpattern, it causes scanning to continue. */
 174
 175       case OP_ALT:
 176       yield = SSB_CONTINUE;
 177       try_next = FALSE;
 178       break;
 179
 180       case OP_KET:
 181       case OP_KETRMAX:
 182       case OP_KETRMIN:
 183       return SSB_CONTINUE;
 184
 185       /* Skip over callout */
 186
 187       case OP_CALLOUT:
 188       tcode += 2 + 2*LINK_SIZE;
 189       break;
 190
 191       /* Skip over lookbehind and negative lookahead assertions */
 192
 193       case OP_ASSERT_NOT:
 194       case OP_ASSERTBACK:
 195       case OP_ASSERTBACK_NOT:
 196       do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
 197       tcode += 1 + LINK_SIZE;
 198       break;
 199
 200       /* Skip over an option setting, changing the caseless flag */
 201
 202       case OP_OPT:
 203       caseless = (tcode[1] & PCRE_CASELESS) != 0;
 204       tcode += 2;
 205       break;
 206
 207       /* BRAZERO does the bracket, but carries on. */
 208
 209       case OP_BRAZERO:
 210       case OP_BRAMINZERO:
 211       if (set_start_bits(++tcode, start_bits, caseless, utf8, cd) == SSB_FAIL)
 212         return SSB_FAIL;
 213 /* =========================================================================
 214       See the comment at the head of this function concerning the next line,
 215       which was an old fudge for the benefit of OS/2.
 216       dummy = 1;
 217   ========================================================================= */
 218       do tcode += GET(tcode,1); while (*tcode == OP_ALT);
 219       tcode += 1 + LINK_SIZE;
 220       break;
 221
 222       /* Single-char * or ? sets the bit and tries the next item */
 223
 224       case OP_STAR:
 225       case OP_MINSTAR:
 226       case OP_POSSTAR:
 227       case OP_QUERY:
 228       case OP_MINQUERY:
 229       case OP_POSQUERY:
 230       set_bit(start_bits, tcode[1], caseless, cd);
 231       tcode += 2;
 232 #ifdef SUPPORT_UTF8
 233       if (utf8 && tcode[-1] >= 0xc0)
 234         tcode += _pcre_utf8_table4[tcode[-1] & 0x3f];
 235 #endif
 236       break;
 237
 238       /* Single-char upto sets the bit and tries the next */
 239
 240       case OP_UPTO:
 241       case OP_MINUPTO:
 242       case OP_POSUPTO:
 243       set_bit(start_bits, tcode[3], caseless, cd);
 244       tcode += 4;
 245 #ifdef SUPPORT_UTF8
 246       if (utf8 && tcode[-1] >= 0xc0)
 247         tcode += _pcre_utf8_table4[tcode[-1] & 0x3f];
 248 #endif
 249       break;
 250
 251       /* At least one single char sets the bit and stops */
 252
 253       case OP_EXACT:       /* Fall through */
 254       tcode += 2;
 255
 256       case OP_CHAR:
 257       case OP_CHARNC:
 258       case OP_PLUS:
 259       case OP_MINPLUS:
 260       case OP_POSPLUS:
 261       set_bit(start_bits, tcode[1], caseless, cd);
 262       try_next = FALSE;
 263       break;
 264
 265       /* Single character type sets the bits and stops */
 266
 267       case OP_NOT_DIGIT:
 268       for (c = 0; c < 32; c++)
 269         start_bits[c] |= ~cd->cbits[c+cbit_digit];
 270       try_next = FALSE;
 271       break;
 272
 273       case OP_DIGIT:
 274       for (c = 0; c < 32; c++)
 275         start_bits[c] |= cd->cbits[c+cbit_digit];
 276       try_next = FALSE;
 277       break;
 278
 279       /* The cbit_space table has vertical tab as whitespace; we have to
 280       discard it. */
 281
 282       case OP_NOT_WHITESPACE:
 283       for (c = 0; c < 32; c++)
 284         {
 285         int d = cd->cbits[c+cbit_space];
 286         if (c == 1) d &= ~0x08;
 287         start_bits[c] |= ~d;
 288         }
 289       try_next = FALSE;
 290       break;
 291
 292       /* The cbit_space table has vertical tab as whitespace; we have to
 293       discard it. */
 294
 295       case OP_WHITESPACE:
 296       for (c = 0; c < 32; c++)
 297         {
 298         int d = cd->cbits[c+cbit_space];
 299         if (c == 1) d &= ~0x08;
 300         start_bits[c] |= d;
 301         }
 302       try_next = FALSE;
 303       break;
 304
 305       case OP_NOT_WORDCHAR:
 306       for (c = 0; c < 32; c++)
 307         start_bits[c] |= ~cd->cbits[c+cbit_word];
 308       try_next = FALSE;
 309       break;
 310
 311       case OP_WORDCHAR:
 312       for (c = 0; c < 32; c++)
 313         start_bits[c] |= cd->cbits[c+cbit_word];
 314       try_next = FALSE;
 315       break;
 316
 317       /* One or more character type fudges the pointer and restarts, knowing
 318       it will hit a single character type and stop there. */
 319
 320       case OP_TYPEPLUS:
 321       case OP_TYPEMINPLUS:
 322       tcode++;
 323       break;
 324
 325       case OP_TYPEEXACT:
 326       tcode += 3;
 327       break;
 328
 329       /* Zero or more repeats of character types set the bits and then
 330       try again. */
 331
 332       case OP_TYPEUPTO:
 333       case OP_TYPEMINUPTO:
 334       case OP_TYPEPOSUPTO:
 335       tcode += 2;               /* Fall through */
 336
 337       case OP_TYPESTAR:
 338       case OP_TYPEMINSTAR:
 339       case OP_TYPEPOSSTAR:
 340       case OP_TYPEQUERY:
 341       case OP_TYPEMINQUERY:
 342       case OP_TYPEPOSQUERY:
 343       switch(tcode[1])
 344         {
 345         case OP_ANY:
 346         return SSB_FAIL;
 347
 348         case OP_NOT_DIGIT:
 349         for (c = 0; c < 32; c++)
 350           start_bits[c] |= ~cd->cbits[c+cbit_digit];
 351         break;
 352
 353         case OP_DIGIT:
 354         for (c = 0; c < 32; c++)
 355           start_bits[c] |= cd->cbits[c+cbit_digit];
 356         break;
 357
 358         /* The cbit_space table has vertical tab as whitespace; we have to
 359         discard it. */
 360
 361         case OP_NOT_WHITESPACE:
 362         for (c = 0; c < 32; c++)
 363           {
 364           int d = cd->cbits[c+cbit_space];
 365           if (c == 1) d &= ~0x08;
 366           start_bits[c] |= ~d;
 367           }
 368         break;
 369
 370         /* The cbit_space table has vertical tab as whitespace; we have to
 371         discard it. */
 372
 373         case OP_WHITESPACE:
 374         for (c = 0; c < 32; c++)
 375           {
 376           int d = cd->cbits[c+cbit_space];
 377           if (c == 1) d &= ~0x08;
 378           start_bits[c] |= d;
 379           }
 380         break;
 381
 382         case OP_NOT_WORDCHAR:
 383         for (c = 0; c < 32; c++)
 384           start_bits[c] |= ~cd->cbits[c+cbit_word];
 385         break;
 386
 387         case OP_WORDCHAR:
 388         for (c = 0; c < 32; c++)
 389           start_bits[c] |= cd->cbits[c+cbit_word];
 390         break;
 391         }
 392
 393       tcode += 2;
 394       break;
 395
 396       /* Character class where all the information is in a bit map: set the
 397       bits and either carry on or not, according to the repeat count. If it was
 398       a negative class, and we are operating with UTF-8 characters, any byte
 399       with a value >= 0xc4 is a potentially valid starter because it starts a
 400       character with a value > 255. */
 401
 402       case OP_NCLASS:
 403 #ifdef SUPPORT_UTF8
 404       if (utf8)
 405         {
 406         start_bits[24] |= 0xf0;              /* Bits for 0xc4 - 0xc8 */
 407         memset(start_bits+25, 0xff, 7);      /* Bits for 0xc9 - 0xff */
 408         }
 409 #endif
 410       /* Fall through */
 411
 412       case OP_CLASS:
 413         {
 414         tcode++;
 415
 416         /* In UTF-8 mode, the bits in a bit map correspond to character
 417         values, not to byte values. However, the bit map we are constructing is
 418         for byte values. So we have to do a conversion for characters whose
 419         value is > 127. In fact, there are only two possible starting bytes for
 420         characters in the range 128 - 255. */
 421
 422 #ifdef SUPPORT_UTF8
 423         if (utf8)
 424           {
 425           for (c = 0; c < 16; c++) start_bits[c] |= tcode[c];
 426           for (c = 128; c < 256; c++)
 427             {
 428             if ((tcode[c/8] && (1 << (c&7))) != 0)
 429               {
 430               int d = (c >> 6) | 0xc0;            /* Set bit for this starter */
 431               start_bits[d/8] |= (1 << (d&7));    /* and then skip on to the */
 432               c = (c & 0xc0) + 0x40 - 1;          /* next relevant character. */
 433               }
 434             }
 435           }
 436
 437         /* In non-UTF-8 mode, the two bit maps are completely compatible. */
 438
 439         else
 440 #endif
 441           {
 442           for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];
 443           }
 444
 445         /* Advance past the bit map, and act on what follows */
 446
 447         tcode += 32;
 448         switch (*tcode)
 449           {
 450           case OP_CRSTAR:
 451           case OP_CRMINSTAR:
 452           case OP_CRQUERY:
 453           case OP_CRMINQUERY:
 454           tcode++;
 455           break;
 456
 457           case OP_CRRANGE:
 458           case OP_CRMINRANGE:
 459           if (((tcode[1] << 8) + tcode[2]) == 0) tcode += 5;
 460             else try_next = FALSE;
 461           break;
 462
 463           default:
 464           try_next = FALSE;
 465           break;
 466           }
 467         }
 468       break; /* End of bitmap class handling */
 469
 470       }      /* End of switch */
 471     }        /* End of try_next loop */
 472
 473   code += GET(code, 1);   /* Advance to next branch */
 474   }
 475 while (*code == OP_ALT);
 476 return yield;
 477 }
 478
 479
 480
 481 /*************************************************
 482 *          Study a compiled expression           *
 483 *************************************************/
 484
 485 /* This function is handed a compiled expression that it must study to produce
 486 information that will speed up the matching. It returns a pcre_extra block
 487 which then gets handed back to pcre_exec().
 488
 489 Arguments:
 490   re        points to the compiled expression
 491   options   contains option bits
 492   errorptr  points to where to place error messages;
 493             set NULL unless error
 494
 495 Returns:    pointer to a pcre_extra block, with study_data filled in and the
 496               appropriate flag set;
 497             NULL on error or if no optimization possible
 498 */
 499
 500 PCRE_EXP_DEFN pcre_extra *
 501 pcre_study(const pcre *external_re, int options, const char **errorptr)
 502 {
 503 uschar start_bits[32];
 504 pcre_extra *extra;
 505 pcre_study_data *study;
 506 const uschar *tables;
 507 uschar *code;
 508 compile_data compile_block;
 509 const real_pcre *re = (const real_pcre *)external_re;
 510
 511 *errorptr = NULL;
 512
 513 if (re == NULL || re->magic_number != MAGIC_NUMBER)
 514   {
 515   *errorptr = "argument is not a compiled regular expression";
 516   return NULL;
 517   }
 518
 519 if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
 520   {
 521   *errorptr = "unknown or incorrect option bit(s) set";
 522   return NULL;
 523   }
 524
 525 code = (uschar *)re + re->name_table_offset +
 526   (re->name_count * re->name_entry_size);
 527
 528 /* For an anchored pattern, or an unanchored pattern that has a first char, or
 529 a multiline pattern that matches only at "line starts", no further processing
 530 at present. */
 531
 532 if ((re->options & PCRE_ANCHORED) != 0 ||
 533     (re->flags & (PCRE_FIRSTSET|PCRE_STARTLINE)) != 0)
 534   return NULL;
 535
 536 /* Set the character tables in the block that is passed around */
 537
 538 tables = re->tables;
 539 if (tables == NULL)
 540   (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
 541   (void *)(&tables));
 542
 543 compile_block.lcc = tables + lcc_offset;
 544 compile_block.fcc = tables + fcc_offset;
 545 compile_block.cbits = tables + cbits_offset;
 546 compile_block.ctypes = tables + ctypes_offset;
 547
 548 /* See if we can find a fixed set of initial characters for the pattern. */
 549
 550 memset(start_bits, 0, 32 * sizeof(uschar));
 551 if (set_start_bits(code, start_bits, (re->options & PCRE_CASELESS) != 0,
 552   (re->options & PCRE_UTF8) != 0, &compile_block) != SSB_DONE) return NULL;
 553
 554 /* Get a pcre_extra block and a pcre_study_data block. The study data is put in
 555 the latter, which is pointed to by the former, which may also get additional
 556 data set later by the calling program. At the moment, the size of
 557 pcre_study_data is fixed. We nevertheless save it in a field for returning via
 558 the pcre_fullinfo() function so that if it becomes variable in the future, we
 559 don't have to change that code. */
 560
 561 extra = (pcre_extra *)(pcre_malloc)
 562   (sizeof(pcre_extra) + sizeof(pcre_study_data));
 563
 564 if (extra == NULL)
 565   {
 566   *errorptr = "failed to get memory";
 567   return NULL;
 568   }
 569
 570 study = (pcre_study_data *)((char *)extra + sizeof(pcre_extra));
 571 extra->flags = PCRE_EXTRA_STUDY_DATA;
 572 extra->study_data = study;
 573
 574 study->size = sizeof(pcre_study_data);
 575 study->options = PCRE_STUDY_MAPPED;
 576 memcpy(study->start_bits, start_bits, sizeof(start_bits));
 577
 578 return extra;
 579 }
 580
 581 /* End of pcre_study.c */