src/src/pcre/pcre_exec.c

   1 /* $Cambridge: exim/src/src/pcre/pcre_exec.c,v 1.2 2005/08/08 10:22:14 ph10 Exp $ */
   2
   3 /*************************************************
   4 *      Perl-Compatible Regular Expressions       *
   5 *************************************************/
   6
   7 /* PCRE is a library of functions to support regular expressions whose syntax
   8 and semantics are as close as possible to those of the Perl 5 language.
   9
  10                        Written by Philip Hazel
  11            Copyright (c) 1997-2005 University of Cambridge
  12
  13 -----------------------------------------------------------------------------
  14 Redistribution and use in source and binary forms, with or without
  15 modification, are permitted provided that the following conditions are met:
  16
  17     * Redistributions of source code must retain the above copyright notice,
  18       this list of conditions and the following disclaimer.
  19
  20     * Redistributions in binary form must reproduce the above copyright
  21       notice, this list of conditions and the following disclaimer in the
  22       documentation and/or other materials provided with the distribution.
  23
  24     * Neither the name of the University of Cambridge nor the names of its
  25       contributors may be used to endorse or promote products derived from
  26       this software without specific prior written permission.
  27
  28 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  29 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  30 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  31 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  32 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  33 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  34 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  35 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  36 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  37 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  38 POSSIBILITY OF SUCH DAMAGE.
  39 -----------------------------------------------------------------------------
  40 */
  41
  42
  43 /* This module contains pcre_exec(), the externally visible function that does
  44 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
  45 possible. There are also some static supporting functions. */
  46
  47
  48 #include "pcre_internal.h"
  49
  50
  51 /* Structure for building a chain of data that actually lives on the
  52 stack, for holding the values of the subject pointer at the start of each
  53 subpattern, so as to detect when an empty string has been matched by a
  54 subpattern - to break infinite loops. When NO_RECURSE is set, these blocks
  55 are on the heap, not on the stack. */
  56
   57 typedef struct eptrblock {
   58   struct eptrblock *epb_prev;        /* Previous block in the chain */
   59   const uschar *epb_saved_eptr;      /* Value of eptr when the group started */
   60 } eptrblock;
  61
   62 /* Flag bits for the flags argument of the match() function */
   63
   64 #define match_condassert   0x01    /* Called to check a condition assertion */
   65 #define match_isgroup      0x02    /* Set if start of bracketed group */
   66
   67 /* Non-error returns from the match() function. Error returns are externally
   68 defined PCRE_ERROR_xxx codes, which are all negative. */
   69
   70 #define MATCH_MATCH        1       /* Matched */
   71 #define MATCH_NOMATCH      0       /* Failed to match */
   72
   73 /* Maximum number of ints of offset to save on the stack for recursive calls.
   74 If the offset vector is bigger, malloc is used. This should be a multiple of 3,
   75 because the offset vector is always a multiple of 3 long. */
   76
   77 #define REC_STACK_SAVE_MAX 30      /* Also the size of the stacksave vector */
   78
   79 /* Min and max values for the common repeats; for the maxima, 0 => infinity.
   80 NOTE(review): entries look like *, *?, +, +?, ?, ?? in order - confirm. */
   81 static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
   82 static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
  83
  84
  85
  86 #ifdef DEBUG
   87 /*************************************************
   88 *        Debugging function to print chars       *
   89 *************************************************/
   90
   91 /* Print a sequence of chars in printable format, stopping at the end of the
   92 subject if requested. Bytes that fail isprint() are output as \xhh escapes.
   93
   94 Arguments:
   95   p           points to characters
   96   length      maximum number to print
   97   is_subject  TRUE to clip length so that output stops at md->end_subject
   98   md          pointer to matching data block; only used if is_subject is TRUE
   99
  100 Returns:     nothing; the output is written with printf()
  101 */
  102
  103 static void
  104 pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
  105 {
  106 int c;
  107 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
  108 while (length-- > 0)
  109   if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
  110 }
 111 #endif
 112
 113
 114
  115 /*************************************************
  116 *          Match a back-reference                *
  117 *************************************************/
  118
  119 /* If a back reference hasn't been set, the length that is passed is greater
  120 than the number of characters left in the string, so the match fails.
  121
  122 Arguments:
  123   offset      index into the offset vector of the start of the reference
  124   eptr        points into the subject
  125   length      number of bytes to be compared
  126   md          points to match data block
  127   ims         the ims flags; PCRE_CASELESS selects comparison via md->lcc
  128
  129 Returns:      TRUE if all length bytes matched, FALSE otherwise
  130 */
  131
  132 static BOOL
  133 match_ref(int offset, register const uschar *eptr, int length, match_data *md,
  134   unsigned long int ims)
  135 {
  136 const uschar *p = md->start_subject + md->offset_vector[offset];
  137
  138 #ifdef DEBUG
  139 if (eptr >= md->end_subject)
  140   printf("matching subject <null>");
  141 else
  142   {
  143   printf("matching subject ");
  144   pchars(eptr, length, TRUE, md);
  145   }
  146 printf(" against backref ");
  147 pchars(p, length, FALSE, md);
  148 printf("\n");
  149 #endif
  150
  151 /* Always fail if not enough characters left */
  152
  153 if (length > md->end_subject - eptr) return FALSE;
  154
  155 /* Separate the caseless case for speed; it compares through md->lcc */
  156
  157 if ((ims & PCRE_CASELESS) != 0)
  158   {
  159   while (length-- > 0)
  160     if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE;
  161   }
  162 else
  163   { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
  164
  165 return TRUE;
  166 }
 167
 168
 169
 170 /***************************************************************************
 171 ****************************************************************************
 172                    RECURSION IN THE match() FUNCTION
 173
 174 The match() function is highly recursive. Some regular expressions can cause
 175 it to recurse thousands of times. I was writing for Unix, so I just let it
 176 call itself recursively. This uses the stack for saving everything that has
 177 to be saved for a recursive call. On Unix, the stack can be large, and this
 178 works fine.
 179
 180 It turns out that on non-Unix systems there are problems with programs that
 181 use a lot of stack. (This despite the fact that every last chip has oodles
 182 of memory these days, and techniques for extending the stack have been known
 183 for decades.) So....
 184
 185 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
 186 calls by keeping local variables that need to be preserved in blocks of memory
  187 obtained from malloc instead of on the stack. Macros are used to
 188 achieve this so that the actual code doesn't look very different to what it
 189 always used to.
 190 ****************************************************************************
 191 ***************************************************************************/
 192
 193
  194 /* These versions of the macros use the stack, as normal: RMATCH is a true
  195 recursive call of match(), and RRETURN is an ordinary return. */
  196 #ifndef NO_RECURSE
  197 #define REGISTER register
  198 #define RMATCH(rx,ra,rb,rc,rd,re,rf,rg) rx = match(ra,rb,rc,rd,re,rf,rg)
  199 #define RRETURN(ra) return ra
 200 #else
 201
 202
 203 /* These versions of the macros manage a private stack on the heap. Note
 204 that the rd argument of RMATCH isn't actually used. It's the md argument of
 205 match(), which never changes. */
 206
 207 #define REGISTER
 208
 209 #define RMATCH(rx,ra,rb,rc,rd,re,rf,rg)\
 210   {\
 211   heapframe *newframe = (pcre_stack_malloc)(sizeof(heapframe));\
 212   if (setjmp(frame->Xwhere) == 0)\
 213     {\
 214     newframe->Xeptr = ra;\
 215     newframe->Xecode = rb;\
 216     newframe->Xoffset_top = rc;\
 217     newframe->Xims = re;\
 218     newframe->Xeptrb = rf;\
 219     newframe->Xflags = rg;\
 220     newframe->Xprevframe = frame;\
 221     frame = newframe;\
 222     DPRINTF(("restarting from line %d\n", __LINE__));\
 223     goto HEAP_RECURSE;\
 224     }\
 225   else\
 226     {\
 227     DPRINTF(("longjumped back to line %d\n", __LINE__));\
 228     frame = md->thisframe;\
 229     rx = frame->Xresult;\
 230     }\
 231   }
 232
  233 #define RRETURN(ra)\
  234   {\
  235   heapframe *newframe = frame;           /* The frame that is finishing */\
  236   frame = newframe->Xprevframe;          /* Step back to the "calling" frame */\
  237   (pcre_stack_free)(newframe);\
  238   if (frame != NULL)                     /* Not top level: resume in RMATCH */\
  239     {\
  240     frame->Xresult = ra;\
  241     md->thisframe = frame;\
  242     longjmp(frame->Xwhere, 1);\
  243     }\
  244   return ra;                             /* Top level: really leave match() */\
  245   }
 246
 247
 248 /* Structure for remembering the local variables in a private frame */
 249
 250 typedef struct heapframe {
 251   struct heapframe *Xprevframe;
 252
 253   /* Function arguments that may change */
 254
 255   const uschar *Xeptr;
 256   const uschar *Xecode;
 257   int Xoffset_top;
 258   long int Xims;
 259   eptrblock *Xeptrb;
 260   int Xflags;
 261
 262   /* Function local variables */
 263
 264   const uschar *Xcallpat;
 265   const uschar *Xcharptr;
 266   const uschar *Xdata;
 267   const uschar *Xnext;
 268   const uschar *Xpp;
 269   const uschar *Xprev;
 270   const uschar *Xsaved_eptr;
 271
 272   recursion_info Xnew_recursive;
 273
 274   BOOL Xcur_is_word;
 275   BOOL Xcondition;
 276   BOOL Xminimize;
 277   BOOL Xprev_is_word;
 278
 279   unsigned long int Xoriginal_ims;
 280
 281 #ifdef SUPPORT_UCP
 282   int Xprop_type;
 283   int Xprop_fail_result;
 284   int Xprop_category;
 285   int Xprop_chartype;
 286   int Xprop_othercase;
 287   int Xprop_test_against;
 288   int *Xprop_test_variable;
 289 #endif
 290
 291   int Xctype;
 292   int Xfc;
 293   int Xfi;
 294   int Xlength;
 295   int Xmax;
 296   int Xmin;
 297   int Xnumber;
 298   int Xoffset;
 299   int Xop;
 300   int Xsave_capture_last;
 301   int Xsave_offset1, Xsave_offset2, Xsave_offset3;
 302   int Xstacksave[REC_STACK_SAVE_MAX];
 303
 304   eptrblock Xnewptrb;
 305
 306   /* Place to pass back result, and where to jump back to */
 307
 308   int  Xresult;
 309   jmp_buf Xwhere;
 310
 311 } heapframe;
 312
 313 #endif
 314
 315
 316 /***************************************************************************
 317 ***************************************************************************/
 318
 319
 320
 321 /*************************************************
 322 *         Match from current position            *
 323 *************************************************/
 324
 325 /* On entry ecode points to the first opcode, and eptr to the first character
 326 in the subject string, while eptrb holds the value of eptr at the start of the
 327 last bracketed group - used for breaking infinite loops matching zero-length
 328 strings. This function is called recursively in many circumstances. Whenever it
 329 returns a negative (error) response, the outer incarnation must also return the
 330 same response.
 331
 332 Performance note: It might be tempting to extract commonly used fields from the
 333 md structure (e.g. utf8, end_subject) into individual variables to improve
 334 performance. Tests using gcc on a SPARC disproved this; in the first case, it
 335 made performance worse.
 336
 337 Arguments:
 338    eptr        pointer in subject
 339    ecode       position in code
 340    offset_top  current top pointer
 341    md          pointer to "static" info for the match
 342    ims         current /i, /m, and /s options
 343    eptrb       pointer to chain of blocks containing eptr at start of
 344                  brackets - for testing for empty matches
 345    flags       can contain
 346                  match_condassert - this is an assertion condition
 347                  match_isgroup - this is the start of a bracketed group
 348
 349 Returns:       MATCH_MATCH if matched            )  these values are >= 0
 350                MATCH_NOMATCH if failed to match  )
 351                a negative PCRE_ERROR_xxx value if aborted by an error condition
 352                  (e.g. stopped by recursion limit)
 353 */
 354
 355 static int
 356 match(REGISTER const uschar *eptr, REGISTER const uschar *ecode,
 357   int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
 358   int flags)
 359 {
 360 /* These variables do not need to be preserved over recursion in this function,
 361 so they can be ordinary variables in all cases. Mark them with "register"
 362 because they are used a lot in loops. */
 363
 364 register int  rrc;    /* Returns from recursive calls */
 365 register int  i;      /* Used for loops not involving calls to RMATCH() */
 366 register int  c;      /* Character values not kept over RMATCH() calls */
 367 register BOOL utf8;   /* Local copy of UTF-8 flag for speed */
 368
 369 /* When recursion is not being used, all "local" variables that have to be
 370 preserved over calls to RMATCH() are part of a "frame" which is obtained from
 371 heap storage. Set up the top-level frame here; others are obtained from the
 372 heap whenever RMATCH() does a "recursion". See the macro definitions above. */
 373
 374 #ifdef NO_RECURSE
 375 heapframe *frame = (pcre_stack_malloc)(sizeof(heapframe));
 376 frame->Xprevframe = NULL;            /* Marks the top level */
 377
 378 /* Copy in the original argument variables */
 379
 380 frame->Xeptr = eptr;
 381 frame->Xecode = ecode;
 382 frame->Xoffset_top = offset_top;
 383 frame->Xims = ims;
 384 frame->Xeptrb = eptrb;
 385 frame->Xflags = flags;
 386
  387 /* This is where control jumps back to in order to effect "recursion" */
 388
 389 HEAP_RECURSE:
 390
 391 /* Macros make the argument variables come from the current frame */
 392
 393 #define eptr               frame->Xeptr
 394 #define ecode              frame->Xecode
 395 #define offset_top         frame->Xoffset_top
 396 #define ims                frame->Xims
 397 #define eptrb              frame->Xeptrb
 398 #define flags              frame->Xflags
 399
 400 /* Ditto for the local variables */
 401
 402 #ifdef SUPPORT_UTF8
 403 #define charptr            frame->Xcharptr
 404 #endif
 405 #define callpat            frame->Xcallpat
 406 #define data               frame->Xdata
 407 #define next               frame->Xnext
 408 #define pp                 frame->Xpp
 409 #define prev               frame->Xprev
 410 #define saved_eptr         frame->Xsaved_eptr
 411
 412 #define new_recursive      frame->Xnew_recursive
 413
 414 #define cur_is_word        frame->Xcur_is_word
 415 #define condition          frame->Xcondition
 416 #define minimize           frame->Xminimize
 417 #define prev_is_word       frame->Xprev_is_word
 418
 419 #define original_ims       frame->Xoriginal_ims
 420
 421 #ifdef SUPPORT_UCP
 422 #define prop_type          frame->Xprop_type
 423 #define prop_fail_result   frame->Xprop_fail_result
 424 #define prop_category      frame->Xprop_category
 425 #define prop_chartype      frame->Xprop_chartype
 426 #define prop_othercase     frame->Xprop_othercase
 427 #define prop_test_against  frame->Xprop_test_against
 428 #define prop_test_variable frame->Xprop_test_variable
 429 #endif
 430
 431 #define ctype              frame->Xctype
 432 #define fc                 frame->Xfc
 433 #define fi                 frame->Xfi
 434 #define length             frame->Xlength
 435 #define max                frame->Xmax
 436 #define min                frame->Xmin
 437 #define number             frame->Xnumber
 438 #define offset             frame->Xoffset
 439 #define op                 frame->Xop
 440 #define save_capture_last  frame->Xsave_capture_last
 441 #define save_offset1       frame->Xsave_offset1
 442 #define save_offset2       frame->Xsave_offset2
 443 #define save_offset3       frame->Xsave_offset3
 444 #define stacksave          frame->Xstacksave
 445
 446 #define newptrb            frame->Xnewptrb
 447
 448 /* When recursion is being used, local variables are allocated on the stack and
 449 get preserved during recursion in the normal way. In this environment, fi and
 450 i, and fc and c, can be the same variables. */
 451
 452 #else
 453 #define fi i
 454 #define fc c
 455
 456
 457 #ifdef SUPPORT_UTF8                /* Many of these variables are used ony */
 458 const uschar *charptr;             /* small blocks of the code. My normal  */
 459 #endif                             /* style of coding would have declared  */
 460 const uschar *callpat;             /* them within each of those blocks.    */
 461 const uschar *data;                /* However, in order to accommodate the */
 462 const uschar *next;                /* version of this code that uses an    */
 463 const uschar *pp;                  /* external "stack" implemented on the  */
 464 const uschar *prev;                /* heap, it is easier to declare them   */
 465 const uschar *saved_eptr;          /* all here, so the declarations can    */
 466                                    /* be cut out in a block. The only      */
 467 recursion_info new_recursive;      /* declarations within blocks below are */
 468                                    /* for variables that do not have to    */
 469 BOOL cur_is_word;                  /* be preserved over a recursive call   */
 470 BOOL condition;                    /* to RMATCH().                         */
 471 BOOL minimize;
 472 BOOL prev_is_word;
 473
 474 unsigned long int original_ims;
 475
 476 #ifdef SUPPORT_UCP
 477 int prop_type;
 478 int prop_fail_result;
 479 int prop_category;
 480 int prop_chartype;
 481 int prop_othercase;
 482 int prop_test_against;
 483 int *prop_test_variable;
 484 #endif
 485
 486 int ctype;
 487 int length;
 488 int max;
 489 int min;
 490 int number;
 491 int offset;
 492 int op;
 493 int save_capture_last;
 494 int save_offset1, save_offset2, save_offset3;
 495 int stacksave[REC_STACK_SAVE_MAX];
 496
 497 eptrblock newptrb;
 498 #endif
 499
  500 /* These statements are here to stop the compiler complaining about uninitialized
 501 variables. */
 502
 503 #ifdef SUPPORT_UCP
 504 prop_fail_result = 0;
 505 prop_test_against = 0;
 506 prop_test_variable = NULL;
 507 #endif
 508
 509 /* OK, now we can get on with the real code of the function. Recursion is
 510 specified by the macros RMATCH and RRETURN. When NO_RECURSE is *not* defined,
 511 these just turn into a recursive call to match() and a "return", respectively.
 512 However, RMATCH isn't like a function call because it's quite a complicated
 513 macro. It has to be used in one particular way. This shouldn't, however, impact
 514 performance when true recursion is being used. */
 515
 516 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
 517
 518 original_ims = ims;    /* Save for resetting on ')' */
 519 utf8 = md->utf8;       /* Local copy of the flag */
 520
 521 /* At the start of a bracketed group, add the current subject pointer to the
 522 stack of such pointers, to be re-instated at the end of the group when we hit
 523 the closing ket. When match() is called in other circumstances, we don't add to
 524 this stack. */
 525
 526 if ((flags & match_isgroup) != 0)
 527   {
 528   newptrb.epb_prev = eptrb;
 529   newptrb.epb_saved_eptr = eptr;
 530   eptrb = &newptrb;
 531   }
 532
 533 /* Now start processing the operations. */
 534
 535 for (;;)
 536   {
 537   op = *ecode;
 538   minimize = FALSE;
 539
 540   /* For partial matching, remember if we ever hit the end of the subject after
 541   matching at least one subject character. */
 542
 543   if (md->partial &&
 544       eptr >= md->end_subject &&
 545       eptr > md->start_match)
 546     md->hitend = TRUE;
 547
 548   /* Opening capturing bracket. If there is space in the offset vector, save
 549   the current subject position in the working slot at the top of the vector. We
 550   mustn't change the current values of the data slot, because they may be set
 551   from a previous iteration of this group, and be referred to by a reference
 552   inside the group.
 553
 554   If the bracket fails to match, we need to restore this value and also the
 555   values of the final offsets, in case they were set by a previous iteration of
 556   the same bracket.
 557
 558   If there isn't enough space in the offset vector, treat this as if it were a
 559   non-capturing bracket. Don't worry about setting the flag for the error case
 560   here; that is handled in the code for KET. */
 561
 562   if (op > OP_BRA)
 563     {
 564     number = op - OP_BRA;
 565
 566     /* For extended extraction brackets (large number), we have to fish out the
 567     number from a dummy opcode at the start. */
 568
 569     if (number > EXTRACT_BASIC_MAX)
 570       number = GET2(ecode, 2+LINK_SIZE);
 571     offset = number << 1;
 572
 573 #ifdef DEBUG
 574     printf("start bracket %d subject=", number);
 575     pchars(eptr, 16, TRUE, md);
 576     printf("\n");
 577 #endif
 578
 579     if (offset < md->offset_max)
 580       {
 581       save_offset1 = md->offset_vector[offset];
 582       save_offset2 = md->offset_vector[offset+1];
 583       save_offset3 = md->offset_vector[md->offset_end - number];
 584       save_capture_last = md->capture_last;
 585
 586       DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
 587       md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
 588
 589       do
 590         {
 591         RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb,
 592           match_isgroup);
 593         if (rrc != MATCH_NOMATCH) RRETURN(rrc);
 594         md->capture_last = save_capture_last;
 595         ecode += GET(ecode, 1);
 596         }
 597       while (*ecode == OP_ALT);
 598
 599       DPRINTF(("bracket %d failed\n", number));
 600
 601       md->offset_vector[offset] = save_offset1;
 602       md->offset_vector[offset+1] = save_offset2;
 603       md->offset_vector[md->offset_end - number] = save_offset3;
 604
 605       RRETURN(MATCH_NOMATCH);
 606       }
 607
 608     /* Insufficient room for saving captured contents */
 609
 610     else op = OP_BRA;
 611     }
 612
 613   /* Other types of node can be handled by a switch */
 614
 615   switch(op)
 616     {
 617     case OP_BRA:     /* Non-capturing bracket: optimized */
 618     DPRINTF(("start bracket 0\n"));
 619     do
 620       {
 621       RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb,
 622         match_isgroup);
 623       if (rrc != MATCH_NOMATCH) RRETURN(rrc);
 624       ecode += GET(ecode, 1);
 625       }
 626     while (*ecode == OP_ALT);
 627     DPRINTF(("bracket 0 failed\n"));
 628     RRETURN(MATCH_NOMATCH);
 629
 630     /* Conditional group: compilation checked that there are no more than
 631     two branches. If the condition is false, skipping the first branch takes us
 632     past the end if there is only one branch, but that's OK because that is
 633     exactly what going to the ket would do. */
 634
 635     case OP_COND:
 636     if (ecode[LINK_SIZE+1] == OP_CREF) /* Condition extract or recurse test */
 637       {
 638       offset = GET2(ecode, LINK_SIZE+2) << 1;  /* Doubled ref number */
 639       condition = (offset == CREF_RECURSE * 2)?
 640         (md->recursive != NULL) :
 641         (offset < offset_top && md->offset_vector[offset] >= 0);
 642       RMATCH(rrc, eptr, ecode + (condition?
 643         (LINK_SIZE + 4) : (LINK_SIZE + 1 + GET(ecode, 1))),
 644         offset_top, md, ims, eptrb, match_isgroup);
 645       RRETURN(rrc);
 646       }
 647
 648     /* The condition is an assertion. Call match() to evaluate it - setting
 649     the final argument TRUE causes it to stop at the end of an assertion. */
 650
 651     else
 652       {
 653       RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
 654           match_condassert | match_isgroup);
 655       if (rrc == MATCH_MATCH)
 656         {
 657         ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE+2);
 658         while (*ecode == OP_ALT) ecode += GET(ecode, 1);
 659         }
 660       else if (rrc != MATCH_NOMATCH)
 661         {
 662         RRETURN(rrc);         /* Need braces because of following else */
 663         }
 664       else ecode += GET(ecode, 1);
 665       RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb,
 666         match_isgroup);
 667       RRETURN(rrc);
 668       }
 669     /* Control never reaches here */
 670
 671     /* Skip over conditional reference or large extraction number data if
 672     encountered. */
 673
 674     case OP_CREF:
 675     case OP_BRANUMBER:
 676     ecode += 3;
 677     break;
 678
 679     /* End of the pattern. If we are in a recursion, we should restore the
 680     offsets appropriately and continue from after the call. */
 681
 682     case OP_END:
 683     if (md->recursive != NULL && md->recursive->group_num == 0)
 684       {
 685       recursion_info *rec = md->recursive;
 686       DPRINTF(("Hit the end in a (?0) recursion\n"));
 687       md->recursive = rec->prevrec;
 688       memmove(md->offset_vector, rec->offset_save,
 689         rec->saved_max * sizeof(int));
 690       md->start_match = rec->save_start;
 691       ims = original_ims;
 692       ecode = rec->after_call;
 693       break;
 694       }
 695
 696     /* Otherwise, if PCRE_NOTEMPTY is set, fail if we have matched an empty
 697     string - backtracking will then try other alternatives, if any. */
 698
 699     if (md->notempty && eptr == md->start_match) RRETURN(MATCH_NOMATCH);
 700     md->end_match_ptr = eptr;          /* Record where we ended */
 701     md->end_offset_top = offset_top;   /* and how many extracts were taken */
 702     RRETURN(MATCH_MATCH);
 703
 704     /* Change option settings */
 705
 706     case OP_OPT:
 707     ims = ecode[1];
 708     ecode += 2;
 709     DPRINTF(("ims set to %02lx\n", ims));
 710     break;
 711
 712     /* Assertion brackets. Check the alternative branches in turn - the
 713     matching won't pass the KET for an assertion. If any one branch matches,
 714     the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
 715     start of each branch to move the current point backwards, so the code at
 716     this level is identical to the lookahead case. */
 717
 718     case OP_ASSERT:
 719     case OP_ASSERTBACK:
 720     do
 721       {
 722       RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
 723         match_isgroup);
 724       if (rrc == MATCH_MATCH) break;
 725       if (rrc != MATCH_NOMATCH) RRETURN(rrc);
 726       ecode += GET(ecode, 1);
 727       }
 728     while (*ecode == OP_ALT);
 729     if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
 730
 731     /* If checking an assertion for a condition, return MATCH_MATCH. */
 732
 733     if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
 734
 735     /* Continue from after the assertion, updating the offsets high water
 736     mark, since extracts may have been taken during the assertion. */
 737
 738     do ecode += GET(ecode,1); while (*ecode == OP_ALT);
 739     ecode += 1 + LINK_SIZE;
 740     offset_top = md->end_offset_top;
 741     continue;
 742
 743     /* Negative assertion: all branches must fail to match */
 744
 745     case OP_ASSERT_NOT:
 746     case OP_ASSERTBACK_NOT:
 747     do
 748       {
 749       RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
 750         match_isgroup);
 751       if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH);
 752       if (rrc != MATCH_NOMATCH) RRETURN(rrc);
 753       ecode += GET(ecode,1);
 754       }
 755     while (*ecode == OP_ALT);
 756
 757     if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
 758
 759     ecode += 1 + LINK_SIZE;
 760     continue;
 761
 762     /* Move the subject pointer back. This occurs only at the start of
 763     each branch of a lookbehind assertion. If we are too close to the start to
 764     move back, this match function fails. When working with UTF-8 we move
 765     back a number of characters, not bytes. */
 766
 767     case OP_REVERSE:
 768 #ifdef SUPPORT_UTF8
 769     if (utf8)
 770       {
 771       c = GET(ecode,1);
 772       for (i = 0; i < c; i++)
 773         {
 774         eptr--;
 775         if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
 776         BACKCHAR(eptr)
 777         }
 778       }
 779     else
 780 #endif
 781
 782     /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
 783
 784       {
 785       eptr -= GET(ecode,1);
 786       if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
 787       }
 788
 789     /* Skip to next op code */
 790
 791     ecode += 1 + LINK_SIZE;
 792     break;
 793
 794     /* The callout item calls an external function, if one is provided, passing
 795     details of the match so far. This is mainly for debugging, though the
 796     function is able to force a failure. */
 797
 798     case OP_CALLOUT:
 799     if (pcre_callout != NULL)
 800       {
 801       pcre_callout_block cb;
 802       cb.version          = 1;   /* Version 1 of the callout block */
 803       cb.callout_number   = ecode[1];
 804       cb.offset_vector    = md->offset_vector;
 805       cb.subject          = (const char *)md->start_subject;
 806       cb.subject_length   = md->end_subject - md->start_subject;
 807       cb.start_match      = md->start_match - md->start_subject;
 808       cb.current_position = eptr - md->start_subject;
 809       cb.pattern_position = GET(ecode, 2);
 810       cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
 811       cb.capture_top      = offset_top/2;
 812       cb.capture_last     = md->capture_last;
 813       cb.callout_data     = md->callout_data;
 814       if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
 815       if (rrc < 0) RRETURN(rrc);
 816       }
 817     ecode += 2 + 2*LINK_SIZE;
 818     break;
 819
 820     /* Recursion either matches the current regex, or some subexpression. The
 821     offset data is the offset to the starting bracket from the start of the
 822     whole pattern. (This is so that it works from duplicated subpatterns.)
 823
 824     If there are any capturing brackets started but not finished, we have to
 825     save their starting points and reinstate them after the recursion. However,
 826     we don't know how many such there are (offset_top records the completed
 827     total) so we just have to save all the potential data. There may be up to
 828     65535 such values, which is too large to put on the stack, but using malloc
 829     for small numbers seems expensive. As a compromise, the stack is used when
 830     there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
  831     is used. If the malloc fails, this incarnation of match() returns
  832     PCRE_ERROR_NOMEMORY, which, like any negative response, is then passed
  833     back up through the outer incarnations.
 834
 835     There are also other values that have to be saved. We use a chained
 836     sequence of blocks that actually live on the stack. Thanks to Robin Houston
 837     for the original version of this logic. */
 838
 839     case OP_RECURSE:    /* Recurse into a group or the whole pattern */
 840       {
 841       callpat = md->start_code + GET(ecode, 1);
 842       new_recursive.group_num = *callpat - OP_BRA;
 843
 844       /* For extended extraction brackets (large number), we have to fish out
 845       the number from a dummy opcode at the start. */
 846
 847       if (new_recursive.group_num > EXTRACT_BASIC_MAX)
 848         new_recursive.group_num = GET2(callpat, 2+LINK_SIZE);
 849
 850       /* Add to "recursing stack" */
 851
 852       new_recursive.prevrec = md->recursive;
 853       md->recursive = &new_recursive;
 854
 855       /* Find where to continue from afterwards */
 856
 857       ecode += 1 + LINK_SIZE;
 858       new_recursive.after_call = ecode;
 859
 860       /* Now save the offset data. */
 861
 862       new_recursive.saved_max = md->offset_end;
 863       if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
 864         new_recursive.offset_save = stacksave;
 865       else
 866         {
 867         new_recursive.offset_save =
 868           (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
 869         if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
 870         }
 871
 872       memcpy(new_recursive.offset_save, md->offset_vector,
 873             new_recursive.saved_max * sizeof(int));
 874       new_recursive.save_start = md->start_match;
 875       md->start_match = eptr;
 876
 877       /* OK, now we can do the recursion. For each top-level alternative we
 878       restore the offset and recursion data. */
 879
 880       DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
 881       do
 882         {
 883         RMATCH(rrc, eptr, callpat + 1 + LINK_SIZE, offset_top, md, ims,
 884             eptrb, match_isgroup);
 885         if (rrc != MATCH_NOMATCH)
 886           {
 887           if (rrc == MATCH_MATCH) md->recursive = new_recursive.prevrec;
 888           if (new_recursive.offset_save != stacksave)
 889             (pcre_free)(new_recursive.offset_save);
 890           RRETURN(rrc);
 891           }
 892         /* Only MATCH_NOMATCH gets here. A match or an error has returned
 893         above, releasing any malloc'd save area first so it cannot leak. */
 894         md->recursive = &new_recursive;
 895         memcpy(md->offset_vector, new_recursive.offset_save,
 896             new_recursive.saved_max * sizeof(int));
 897         callpat += GET(callpat, 1);
 898         }
 899       while (*callpat == OP_ALT);
 900
 901       DPRINTF(("Recursion didn't match\n"));
 902       md->recursive = new_recursive.prevrec;
 903       if (new_recursive.offset_save != stacksave)
 904         (pcre_free)(new_recursive.offset_save);
 905       RRETURN(MATCH_NOMATCH);
 906       }
 907     /* Control never reaches here */
 908
 909     /* "Once" brackets are like assertion brackets except that after a match,
 910     the point in the subject string is not moved back. Thus there can never be
 911     a move back into the brackets. Friedl calls these "atomic" subpatterns.
 912     Check the alternative branches in turn - the matching won't pass the KET
 913     for this kind of subpattern. If any one branch matches, we carry on as at
 914     the end of a normal bracket, leaving the subject pointer. */
 915
 916     case OP_ONCE:
 917       {
 918       prev = ecode;
 919       saved_eptr = eptr;
 920
 921       do
 922         {
 923         RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims,
 924           eptrb, match_isgroup);
 925         if (rrc == MATCH_MATCH) break;
 926         if (rrc != MATCH_NOMATCH) RRETURN(rrc);
 927         ecode += GET(ecode,1);
 928         }
 929       while (*ecode == OP_ALT);
 930
 931       /* If we hit the end of the group (which could be repeated), fail */
 932
 933       if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
 934
 935       /* Continue as from after the assertion, updating the offsets high water
 936       mark, since extracts may have been taken. */
 937
 938       do ecode += GET(ecode,1); while (*ecode == OP_ALT);
 939
 940       offset_top = md->end_offset_top;
 941       eptr = md->end_match_ptr;
 942
 943       /* For a non-repeating ket, just continue at this level. This also
 944       happens for a repeating ket if no characters were matched in the group.
 945       This is the forcible breaking of infinite loops as implemented in Perl
 946       5.005. If there is an options reset, it will get obeyed in the normal
 947       course of events. */
 948
 949       if (*ecode == OP_KET || eptr == saved_eptr)
 950         {
 951         ecode += 1+LINK_SIZE;
 952         break;
 953         }
 954
 955       /* The repeating kets try the rest of the pattern or restart from the
 956       preceding bracket, in the appropriate order. We need to reset any options
 957       that changed within the bracket before re-running it, so check the next
 958       opcode. The KET is 1+LINK_SIZE bytes long, so the option bits of an
 959       OP_OPT that follows it are at offset 2+LINK_SIZE. */
 960       if (ecode[1+LINK_SIZE] == OP_OPT)
 961         {
 962         ims = (ims & ~PCRE_IMS) | ecode[2+LINK_SIZE];
 963         DPRINTF(("ims set to %02lx at group repeat\n", ims));
 964         }
 965
 966       if (*ecode == OP_KETRMIN)
 967         {
 968         RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0);
 969         if (rrc != MATCH_NOMATCH) RRETURN(rrc);
 970         RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);
 971         if (rrc != MATCH_NOMATCH) RRETURN(rrc);
 972         }
 973       else  /* OP_KETRMAX */
 974         {
 975         RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);
 976         if (rrc != MATCH_NOMATCH) RRETURN(rrc);
 977         RMATCH(rrc, eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0);
 978         if (rrc != MATCH_NOMATCH) RRETURN(rrc);
 979         }
 980       }
 981     RRETURN(MATCH_NOMATCH);
 982
 983     /* An alternation is the end of a branch; scan along to find the end of the
 984     bracketed group and go to there. */
 985
 986     case OP_ALT:
 987     do ecode += GET(ecode,1); while (*ecode == OP_ALT);
 988     break;
 989
 990     /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
 991     that it may occur zero times. It may repeat infinitely, or not at all -
 992     i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
 993     repeat limits are compiled as a number of copies, with the optional ones
 994     preceded by BRAZERO or BRAMINZERO. */
 995
 996     case OP_BRAZERO:
 997       {
 998       next = ecode+1;
 999       RMATCH(rrc, eptr, next, offset_top, md, ims, eptrb, match_isgroup);
1000       if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1001       do next += GET(next,1); while (*next == OP_ALT);
1002       ecode = next + 1+LINK_SIZE;
1003       }
1004     break;
1005
1006     case OP_BRAMINZERO:
1007       {
1008       next = ecode+1;
1009       do next += GET(next,1); while (*next == OP_ALT);
1010       RMATCH(rrc, eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb,
1011         match_isgroup);
1012       if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1013       ecode++;
1014       }
1015     break;
1016
1017     /* End of a group, repeated or non-repeating. If we are at the end of
1018     an assertion "group", stop matching and return MATCH_MATCH, but record the
1019     current high water mark for use by positive assertions. Do this also
1020     for the "once" (non-backing-up) groups. */
1021
1022     case OP_KET:
1023     case OP_KETRMIN:
1024     case OP_KETRMAX:
1025       {
1026       prev = ecode - GET(ecode, 1);
1027       saved_eptr = eptrb->epb_saved_eptr;
1028
1029       /* Back up the stack of bracket start pointers. */
1030
1031       eptrb = eptrb->epb_prev;
1032
1033       if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
1034           *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
1035           *prev == OP_ONCE)
1036         {
1037         md->end_match_ptr = eptr;      /* For ONCE */
1038         md->end_offset_top = offset_top;
1039         RRETURN(MATCH_MATCH);
1040         }
1041
1042       /* In all other cases except a conditional group we have to check the
1043       group number back at the start and if necessary complete handling an
1044       extraction by setting the offsets and bumping the high water mark. */
1045
1046       if (*prev != OP_COND)
1047         {
1048         number = *prev - OP_BRA;
1049
1050         /* For extended extraction brackets (large number), we have to fish out
1051         the number from a dummy opcode at the start. */
1052
1053         if (number > EXTRACT_BASIC_MAX) number = GET2(prev, 2+LINK_SIZE);
1054         offset = number << 1;
1055
1056 #ifdef DEBUG
1057         printf("end bracket %d", number);
1058         printf("\n");
1059 #endif
1060
1061         /* Test for a numbered group. This includes groups called as a result
1062         of recursion. Note that whole-pattern recursion is coded as a recurse
1063         into group 0, so it won't be picked up here. Instead, we catch it when
1064         the OP_END is reached. */
1065
1066         if (number > 0)
1067           {
1068           md->capture_last = number;
1069           if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1070             {
1071             md->offset_vector[offset] =
1072               md->offset_vector[md->offset_end - number];
1073             md->offset_vector[offset+1] = eptr - md->start_subject;
1074             if (offset_top <= offset) offset_top = offset + 2;
1075             }
1076
1077           /* Handle a recursively called group. Restore the offsets
1078           appropriately and continue from after the call. */
1079
1080           if (md->recursive != NULL && md->recursive->group_num == number)
1081             {
1082             recursion_info *rec = md->recursive;
1083             DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
1084             md->recursive = rec->prevrec;
1085             md->start_match = rec->save_start;
1086             memcpy(md->offset_vector, rec->offset_save,
1087               rec->saved_max * sizeof(int));
1088             ecode = rec->after_call;
1089             ims = original_ims;
1090             break;
1091             }
1092           }
1093         }
1094
1095       /* Reset the value of the ims flags, in case they got changed during
1096       the group. */
1097
1098       ims = original_ims;
1099       DPRINTF(("ims reset to %02lx\n", ims));
1100
1101       /* For a non-repeating ket, just continue at this level. This also
1102       happens for a repeating ket if no characters were matched in the group.
1103       This is the forcible breaking of infinite loops as implemented in Perl
1104       5.005. If there is an options reset, it will get obeyed in the normal
1105       course of events. */
1106
1107       if (*ecode == OP_KET || eptr == saved_eptr)
1108         {
1109         ecode += 1 + LINK_SIZE;
1110         break;
1111         }
1112
1113       /* The repeating kets try the rest of the pattern or restart from the
1114       preceding bracket, in the appropriate order. */
1115
1116       if (*ecode == OP_KETRMIN)
1117         {
1118         RMATCH(rrc, eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0);
1119         if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1120         RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);
1121         if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1122         }
1123       else  /* OP_KETRMAX */
1124         {
1125         RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);
1126         if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1127         RMATCH(rrc, eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0);
1128         if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1129         }
1130       }
1131
1132     RRETURN(MATCH_NOMATCH);
1133
1134     /* Start of subject unless notbol, or after internal newline if multiline */
1135
1136     case OP_CIRC:
1137     if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
1138     if ((ims & PCRE_MULTILINE) != 0)
1139       {
1140       if (eptr != md->start_subject && eptr[-1] != NEWLINE)
1141         RRETURN(MATCH_NOMATCH);
1142       ecode++;
1143       break;
1144       }
1145     /* ... else fall through */
1146
1147     /* Start of subject assertion */
1148
1149     case OP_SOD:
1150     if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
1151     ecode++;
1152     break;
1153
1154     /* Start of match assertion */
1155
1156     case OP_SOM:
1157     if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
1158     ecode++;
1159     break;
1160
1161     /* Assert before internal newline if multiline, or before a terminating
1162     newline unless endonly is set, else end of subject unless noteol is set. */
1163
1164     case OP_DOLL:
1165     if ((ims & PCRE_MULTILINE) != 0)
1166       {
1167       if (eptr < md->end_subject)
1168         { if (*eptr != NEWLINE) RRETURN(MATCH_NOMATCH); }
1169       else
1170         { if (md->noteol) RRETURN(MATCH_NOMATCH); }
1171       ecode++;
1172       break;
1173       }
1174     else
1175       {
1176       if (md->noteol) RRETURN(MATCH_NOMATCH);
1177       if (!md->endonly)
1178         {
1179         if (eptr < md->end_subject - 1 ||
1180            (eptr == md->end_subject - 1 && *eptr != NEWLINE))
1181           RRETURN(MATCH_NOMATCH);
1182         ecode++;
1183         break;
1184         }
1185       }
1186     /* ... else fall through */
1187
1188     /* End of subject assertion (\z) */
1189
1190     case OP_EOD:
1191     if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
1192     ecode++;
1193     break;
1194
1195     /* End of subject or ending \n assertion (\Z) */
1196
1197     case OP_EODN:
1198     if (eptr < md->end_subject - 1 ||
1199        (eptr == md->end_subject - 1 && *eptr != NEWLINE)) RRETURN(MATCH_NOMATCH);
1200     ecode++;
1201     break;
1202
1203     /* Word boundary assertions */
1204
1205     case OP_NOT_WORD_BOUNDARY:
1206     case OP_WORD_BOUNDARY:
1207       {
1208
1209       /* Find out if the previous and current characters are "word" characters.
1210       It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
1211       be "non-word" characters. */
1212
1213 #ifdef SUPPORT_UTF8
1214       if (utf8)
1215         {
1216         if (eptr == md->start_subject) prev_is_word = FALSE; else
1217           {
1218           const uschar *lastptr = eptr - 1;
1219           while((*lastptr & 0xc0) == 0x80) lastptr--;
1220           GETCHAR(c, lastptr);
1221           prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1222           }
1223         if (eptr >= md->end_subject) cur_is_word = FALSE; else
1224           {
1225           GETCHAR(c, eptr);
1226           cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1227           }
1228         }
1229       else
1230 #endif
1231
1232       /* More streamlined when not in UTF-8 mode */
1233
1234         {
1235         prev_is_word = (eptr != md->start_subject) &&
1236           ((md->ctypes[eptr[-1]] & ctype_word) != 0);
1237         cur_is_word = (eptr < md->end_subject) &&
1238           ((md->ctypes[*eptr] & ctype_word) != 0);
1239         }
1240
1241       /* Now see if the situation is what we want */
1242
1243       if ((*ecode++ == OP_WORD_BOUNDARY)?
1244            cur_is_word == prev_is_word : cur_is_word != prev_is_word)
1245         RRETURN(MATCH_NOMATCH);
1246       }
1247     break;
1248
1249     /* Match a single character type; inline for speed */
1250
1251     case OP_ANY:
1252     if ((ims & PCRE_DOTALL) == 0 && eptr < md->end_subject && *eptr == NEWLINE)
1253       RRETURN(MATCH_NOMATCH);
1254     if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1255 #ifdef SUPPORT_UTF8
1256     if (utf8)
1257       while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
1258 #endif
1259     ecode++;
1260     break;
1261
1262     /* Match a single byte, even in UTF-8 mode. This opcode really does match
1263     any byte, even newline, independent of the setting of PCRE_DOTALL. */
1264
1265     case OP_ANYBYTE:
1266     if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1267     ecode++;
1268     break;
1269
1270     case OP_NOT_DIGIT:
1271     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1272     GETCHARINCTEST(c, eptr);
1273     if (
1274 #ifdef SUPPORT_UTF8
1275        c < 256 &&
1276 #endif
1277        (md->ctypes[c] & ctype_digit) != 0
1278        )
1279       RRETURN(MATCH_NOMATCH);
1280     ecode++;
1281     break;
1282
1283     case OP_DIGIT:
1284     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1285     GETCHARINCTEST(c, eptr);
1286     if (
1287 #ifdef SUPPORT_UTF8
1288        c >= 256 ||
1289 #endif
1290        (md->ctypes[c] & ctype_digit) == 0
1291        )
1292       RRETURN(MATCH_NOMATCH);
1293     ecode++;
1294     break;
1295
1296     case OP_NOT_WHITESPACE:
1297     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1298     GETCHARINCTEST(c, eptr);
1299     if (
1300 #ifdef SUPPORT_UTF8
1301        c < 256 &&
1302 #endif
1303        (md->ctypes[c] & ctype_space) != 0
1304        )
1305       RRETURN(MATCH_NOMATCH);
1306     ecode++;
1307     break;
1308
1309     case OP_WHITESPACE:
1310     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1311     GETCHARINCTEST(c, eptr);
1312     if (
1313 #ifdef SUPPORT_UTF8
1314        c >= 256 ||
1315 #endif
1316        (md->ctypes[c] & ctype_space) == 0
1317        )
1318       RRETURN(MATCH_NOMATCH);
1319     ecode++;
1320     break;
1321
1322     case OP_NOT_WORDCHAR:
1323     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1324     GETCHARINCTEST(c, eptr);
1325     if (
1326 #ifdef SUPPORT_UTF8
1327        c < 256 &&
1328 #endif
1329        (md->ctypes[c] & ctype_word) != 0
1330        )
1331       RRETURN(MATCH_NOMATCH);
1332     ecode++;
1333     break;
1334
1335     case OP_WORDCHAR:
1336     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1337     GETCHARINCTEST(c, eptr);
1338     if (
1339 #ifdef SUPPORT_UTF8
1340        c >= 256 ||
1341 #endif
1342        (md->ctypes[c] & ctype_word) == 0
1343        )
1344       RRETURN(MATCH_NOMATCH);
1345     ecode++;
1346     break;
1347
1348 #ifdef SUPPORT_UCP
1349     /* Check the next character by Unicode property. We will get here only
1350     if the support is in the binary; otherwise a compile-time error occurs. */
1351
1352     case OP_PROP:
1353     case OP_NOTPROP:
1354     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1355     GETCHARINCTEST(c, eptr);
1356       {
1357       int chartype, rqdtype;
1358       int othercase;
1359       int category = ucp_findchar(c, &chartype, &othercase);
1360
1361       rqdtype = *(++ecode);
1362       ecode++;
1363
1364       if (rqdtype >= 128)
1365         {
1366         if ((rqdtype - 128 != category) == (op == OP_PROP))
1367           RRETURN(MATCH_NOMATCH);
1368         }
1369       else
1370         {
1371         if ((rqdtype != chartype) == (op == OP_PROP))
1372           RRETURN(MATCH_NOMATCH);
1373         }
1374       }
1375     break;
1376
1377     /* Match an extended Unicode sequence. We will get here only if the support
1378     is in the binary; otherwise a compile-time error occurs. */
1379
1380     case OP_EXTUNI:
1381     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1382     GETCHARINCTEST(c, eptr);
1383       {
1384       int chartype;
1385       int othercase;
1386       int category = ucp_findchar(c, &chartype, &othercase);
1387       if (category == ucp_M) RRETURN(MATCH_NOMATCH);
1388       while (eptr < md->end_subject)
1389         {
1390         int len = 1;
1391         if (!utf8) c = *eptr; else
1392           {
1393           GETCHARLEN(c, eptr, len);
1394           }
1395         category = ucp_findchar(c, &chartype, &othercase);
1396         if (category != ucp_M) break;
1397         eptr += len;
1398         }
1399       }
1400     ecode++;
1401     break;
1402 #endif
1403
1404
1405     /* Match a back reference, possibly repeatedly. Look past the end of the
1406     item to see if there is repeat information following. The code is similar
1407     to that for character classes, but repeated for efficiency. Then obey
1408     similar code to character type repeats - written out again for speed.
1409     However, if the referenced string is the empty string, always treat
1410     it as matched, any number of times (otherwise there could be infinite
1411     loops). */
1412
1413     case OP_REF:
1414       {
1415       offset = GET2(ecode, 1) << 1;               /* Doubled ref number */
1416       ecode += 3;                                 /* Advance past item */
1417
1418       /* If the reference is unset, set the length to be longer than the amount
1419       of subject left; this ensures that every attempt at a match fails. We
1420       can't just fail here, because of the possibility of quantifiers with zero
1421       minima. */
1422
1423       length = (offset >= offset_top || md->offset_vector[offset] < 0)?
1424         md->end_subject - eptr + 1 :
1425         md->offset_vector[offset+1] - md->offset_vector[offset];
1426
1427       /* Set up for repetition, or handle the non-repeated case */
1428
1429       switch (*ecode)
1430         {
1431         case OP_CRSTAR:
1432         case OP_CRMINSTAR:
1433         case OP_CRPLUS:
1434         case OP_CRMINPLUS:
1435         case OP_CRQUERY:
1436         case OP_CRMINQUERY:
1437         c = *ecode++ - OP_CRSTAR;
1438         minimize = (c & 1) != 0;
1439         min = rep_min[c];                 /* Pick up values from tables; */
1440         max = rep_max[c];                 /* zero for max => infinity */
1441         if (max == 0) max = INT_MAX;
1442         break;
1443
1444         case OP_CRRANGE:
1445         case OP_CRMINRANGE:
1446         minimize = (*ecode == OP_CRMINRANGE);
1447         min = GET2(ecode, 1);
1448         max = GET2(ecode, 3);
1449         if (max == 0) max = INT_MAX;
1450         ecode += 5;
1451         break;
1452
1453         default:               /* No repeat follows */
1454         if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
1455         eptr += length;
1456         continue;              /* With the main loop */
1457         }
1458
1459       /* If the length of the reference is zero, just continue with the
1460       main loop. */
1461
1462       if (length == 0) continue;
1463
1464       /* First, ensure the minimum number of matches are present. We get back
1465       the length of the reference string explicitly rather than passing the
1466       address of eptr, so that eptr can be a register variable. */
1467
1468       for (i = 1; i <= min; i++)
1469         {
1470         if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
1471         eptr += length;
1472         }
1473
1474       /* If min = max, continue at the same level without recursion.
1475       They are not both allowed to be zero. */
1476
1477       if (min == max) continue;
1478
1479       /* If minimizing, keep trying and advancing the pointer */
1480
1481       if (minimize)
1482         {
1483         for (fi = min;; fi++)
1484           {
1485           RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1486           if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1487           if (fi >= max || !match_ref(offset, eptr, length, md, ims))
1488             RRETURN(MATCH_NOMATCH);
1489           eptr += length;
1490           }
1491         /* Control never gets here */
1492         }
1493
1494       /* If maximizing, find the longest string and work backwards */
1495
1496       else
1497         {
1498         pp = eptr;
1499         for (i = min; i < max; i++)
1500           {
1501           if (!match_ref(offset, eptr, length, md, ims)) break;
1502           eptr += length;
1503           }
1504         while (eptr >= pp)
1505           {
1506           RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1507           if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1508           eptr -= length;
1509           }
1510         RRETURN(MATCH_NOMATCH);
1511         }
1512       }
1513     /* Control never gets here */
1514
1515
1516
1517     /* Match a bit-mapped character class, possibly repeatedly. This op code is
1518     used when all the characters in the class have values in the range 0-255,
1519     and either the matching is caseful, or the characters are in the range
1520     0-127 when UTF-8 processing is enabled. The only difference between
1521     OP_CLASS and OP_NCLASS occurs when a data character outside the range is
1522     encountered.
1523
1524     First, look past the end of the item to see if there is repeat information
1525     following. Then obey similar code to character type repeats - written out
1526     again for speed. */
1527
1528     case OP_NCLASS:
1529     case OP_CLASS:
1530       {
1531       data = ecode + 1;                /* Save for matching */
1532       ecode += 33;                     /* Advance past the item */
1533
1534       switch (*ecode)
1535         {
1536         case OP_CRSTAR:
1537         case OP_CRMINSTAR:
1538         case OP_CRPLUS:
1539         case OP_CRMINPLUS:
1540         case OP_CRQUERY:
1541         case OP_CRMINQUERY:
1542         c = *ecode++ - OP_CRSTAR;
1543         minimize = (c & 1) != 0;
1544         min = rep_min[c];                 /* Pick up values from tables; */
1545         max = rep_max[c];                 /* zero for max => infinity */
1546         if (max == 0) max = INT_MAX;
1547         break;
1548
1549         case OP_CRRANGE:
1550         case OP_CRMINRANGE:
1551         minimize = (*ecode == OP_CRMINRANGE);
1552         min = GET2(ecode, 1);
1553         max = GET2(ecode, 3);
1554         if (max == 0) max = INT_MAX;
1555         ecode += 5;
1556         break;
1557
1558         default:               /* No repeat follows */
1559         min = max = 1;
1560         break;
1561         }
1562
1563       /* First, ensure the minimum number of matches are present. */
1564
1565 #ifdef SUPPORT_UTF8
1566       /* UTF-8 mode */
1567       if (utf8)
1568         {
1569         for (i = 1; i <= min; i++)
1570           {
1571           if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1572           GETCHARINC(c, eptr);
1573           if (c > 255)
1574             {
1575             if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
1576             }
1577           else
1578             {
1579             if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1580             }
1581           }
1582         }
1583       else
1584 #endif
1585       /* Not UTF-8 mode */
1586         {
1587         for (i = 1; i <= min; i++)
1588           {
1589           if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1590           c = *eptr++;
1591           if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1592           }
1593         }
1594
1595       /* If max == min we can continue with the main loop without the
1596       need to recurse. */
1597
1598       if (min == max) continue;
1599
1600       /* If minimizing, keep testing the rest of the expression and advancing
1601       the pointer while it matches the class. */
1602
1603       if (minimize)
1604         {
1605 #ifdef SUPPORT_UTF8
1606         /* UTF-8 mode */
1607         if (utf8)
1608           {
1609           for (fi = min;; fi++)
1610             {
1611             RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1612             if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1613             if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1614             GETCHARINC(c, eptr);
1615             if (c > 255)
1616               {
1617               if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
1618               }
1619             else
1620               {
1621               if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1622               }
1623             }
1624           }
1625         else
1626 #endif
1627         /* Not UTF-8 mode */
1628           {
1629           for (fi = min;; fi++)
1630             {
1631             RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1632             if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1633             if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1634             c = *eptr++;
1635             if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1636             }
1637           }
1638         /* Control never gets here */
1639         }
1640
1641       /* If maximizing, find the longest possible run, then work backwards. */
1642
1643       else
1644         {
1645         pp = eptr;
1646
1647 #ifdef SUPPORT_UTF8
1648         /* UTF-8 mode */
1649         if (utf8)
1650           {
1651           for (i = min; i < max; i++)
1652             {
1653             int len = 1;
1654             if (eptr >= md->end_subject) break;
1655             GETCHARLEN(c, eptr, len);
1656             if (c > 255)
1657               {
1658               if (op == OP_CLASS) break;
1659               }
1660             else
1661               {
1662               if ((data[c/8] & (1 << (c&7))) == 0) break;
1663               }
1664             eptr += len;
1665             }
1666           for (;;)
1667             {
1668             RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1669             if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1670             if (eptr-- == pp) break;        /* Stop if tried at original pos */
1671             BACKCHAR(eptr);
1672             }
1673           }
1674         else
1675 #endif
1676           /* Not UTF-8 mode */
1677           {
1678           for (i = min; i < max; i++)
1679             {
1680             if (eptr >= md->end_subject) break;
1681             c = *eptr;
1682             if ((data[c/8] & (1 << (c&7))) == 0) break;
1683             eptr++;
1684             }
1685           while (eptr >= pp)
1686             {
1687             RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1688             eptr--;
1689             if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1690             }
1691           }
1692
1693         RRETURN(MATCH_NOMATCH);
1694         }
1695       }
1696     /* Control never gets here */
1697
1698
1699     /* Match an extended character class. This opcode is encountered only
1700     in UTF-8 mode, because that's the only time it is compiled. A repeat
1701     opcode may follow the class item; if none does, min = max = 1. */
1702 #ifdef SUPPORT_UTF8
1703     case OP_XCLASS:
1704       {
1705       data = ecode + 1 + LINK_SIZE;                /* Save for matching */
1706       ecode += GET(ecode, 1);                      /* Advance past the item */
1707
1708       switch (*ecode)                              /* Look for a repeat */
1709         {
1710         case OP_CRSTAR:
1711         case OP_CRMINSTAR:
1712         case OP_CRPLUS:
1713         case OP_CRMINPLUS:
1714         case OP_CRQUERY:
1715         case OP_CRMINQUERY:
1716         c = *ecode++ - OP_CRSTAR;         /* Offset used to index the tables */
1717         minimize = (c & 1) != 0;          /* Odd offset => minimizing form */
1718         min = rep_min[c];                 /* Pick up values from tables; */
1719         max = rep_max[c];                 /* zero for max => infinity */
1720         if (max == 0) max = INT_MAX;
1721         break;
1722
1723         case OP_CRRANGE:
1724         case OP_CRMINRANGE:
1725         minimize = (*ecode == OP_CRMINRANGE);
1726         min = GET2(ecode, 1);             /* Explicit counts follow the opcode; */
1727         max = GET2(ecode, 3);             /* zero for max => infinity */
1728         if (max == 0) max = INT_MAX;
1729         ecode += 5;                       /* Skip the opcode and both counts */
1730         break;
1731
1732         default:               /* No repeat follows */
1733         min = max = 1;
1734         break;
1735         }
1736
1737       /* First, ensure the minimum number of matches are present. Each
1738       character is read with GETCHARINC and tested with _pcre_xclass(). */
1739       for (i = 1; i <= min; i++)
1740         {
1741         if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1742         GETCHARINC(c, eptr);
1743         if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
1744         }
1745
1746       /* If max == min we can continue with the main loop without the
1747       need to recurse. */
1748
1749       if (min == max) continue;
1750
1751       /* If minimizing, keep testing the rest of the expression and advancing
1752       the pointer while it matches the class. */
1753
1754       if (minimize)
1755         {
1756         for (fi = min;; fi++)
1757           {
1758           RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1759           if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1760           if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1761           GETCHARINC(c, eptr);            /* Limits were tested before reading */
1762           if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
1763           }
1764         /* Control never gets here */
1765         }
1766
1767       /* If maximizing, find the longest possible run, then work backwards. */
1768
1769       else
1770         {
1771         pp = eptr;                        /* Remember where the run starts */
1772         for (i = min; i < max; i++)
1773           {
1774           int len = 1;
1775           if (eptr >= md->end_subject) break;
1776           GETCHARLEN(c, eptr, len);       /* Read without advancing eptr */
1777           if (!_pcre_xclass(c, data)) break;
1778           eptr += len;
1779           }
1780         for(;;)
1781           {
1782           RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1783           if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1784           if (eptr-- == pp) break;        /* Stop if tried at original pos */
1785           BACKCHAR(eptr)                  /* presumably backs up to the start of the character (macro defined elsewhere) */
1786           }
1787         RRETURN(MATCH_NOMATCH);
1788         }
1789
1790       /* Control never gets here */
1791       }
1792 #endif    /* End of XCLASS */
1793
1794     /* Match a single character, casefully. On success ecode is left just
1795     after the pattern character and eptr just after the subject character. */
1796     case OP_CHAR:
1797 #ifdef SUPPORT_UTF8
1798     if (utf8)
1799       {
1800       length = 1;
1801       ecode++;                            /* Step over the opcode */
1802       GETCHARLEN(fc, ecode, length);      /* length is used below as the byte count; fc is not used in this arm */
1803       if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
1804       while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
1805       }
1806     else
1807 #endif
1808
1809     /* Non-UTF-8 mode: the pattern character is the single byte at ecode[1] */
1810       {
1811       if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
1812       if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
1813       ecode += 2;                         /* Opcode plus one data byte */
1814       }
1815     break;
1816
1817     /* Match a single character, caselessly. One-byte characters are compared
1818     through the md->lcc table in both UTF-8 and non-UTF-8 mode. */
1819     case OP_CHARNC:
1820 #ifdef SUPPORT_UTF8
1821     if (utf8)
1822       {
1823       length = 1;
1824       ecode++;                            /* Step over the opcode */
1825       GETCHARLEN(fc, ecode, length);
1826
1827       if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
1828
1829       /* If the pattern character's value is < 128, we have only one byte, and
1830       can use the fast lookup table. */
1831
1832       if (fc < 128)
1833         {
1834         if (md->lcc[*ecode++] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
1835         }
1836
1837       /* Otherwise we must pick up the subject character */
1838
1839       else
1840         {
1841         int dc;
1842         GETCHARINC(dc, eptr);
1843         ecode += length;                  /* Step over the pattern character */
1844
1845         /* If we have Unicode property support, we can use it to test the other
1846         case of the character, if there is one. The result of ucp_findchar() is
1847         < 0 if the char isn't found, and othercase is returned as zero if there
1848         isn't one. Without SUPPORT_UCP the test below is compiled out, so any
1849         difference between fc and dc is a mismatch. */
1850         if (fc != dc)
1851           {
1852 #ifdef SUPPORT_UCP
1853           int chartype;
1854           int othercase;
1855           if (ucp_findchar(fc, &chartype, &othercase) < 0 || dc != othercase)
1856 #endif
1857             RRETURN(MATCH_NOMATCH);
1858           }
1859         }
1860       }
1861     else
1862 #endif   /* SUPPORT_UTF8 */
1863
1864     /* Non-UTF-8 mode: compare the lower-cased bytes */
1865       {
1866       if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
1867       if (md->lcc[ecode[1]] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
1868       ecode += 2;                         /* Opcode plus one data byte */
1869       }
1870     break;
1871
1872     /* Match a single character repeatedly; different opcodes share code. */
1873
1874     case OP_EXACT:
1875     min = max = GET2(ecode, 1);
1876     ecode += 3;
1877     goto REPEATCHAR;
1878
1879     case OP_UPTO:
1880     case OP_MINUPTO:
1881     min = 0;
1882     max = GET2(ecode, 1);
1883     minimize = *ecode == OP_MINUPTO;
1884     ecode += 3;
1885     goto REPEATCHAR;
1886
1887     case OP_STAR:
1888     case OP_MINSTAR:
1889     case OP_PLUS:
1890     case OP_MINPLUS:
1891     case OP_QUERY:
1892     case OP_MINQUERY:
1893     c = *ecode++ - OP_STAR;
1894     minimize = (c & 1) != 0;
1895     min = rep_min[c];                 /* Pick up values from tables; */
1896     max = rep_max[c];                 /* zero for max => infinity */
1897     if (max == 0) max = INT_MAX;
1898
1899     /* Common code for all repeated single-character matches. We can give
1900     up quickly if there are fewer than the minimum number of characters left in
1901     the subject. */
1902
1903     REPEATCHAR:
1904 #ifdef SUPPORT_UTF8
1905     if (utf8)
1906       {
1907       length = 1;
1908       charptr = ecode;
1909       GETCHARLEN(fc, ecode, length);
1910       if (min * length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
1911       ecode += length;
1912
1913       /* Handle multibyte character matching specially here. There is
1914       support for caseless matching if UCP support is present. */
1915
1916       if (length > 1)
1917         {
1918         int oclength = 0;
1919         uschar occhars[8];
1920
1921 #ifdef SUPPORT_UCP
1922         int othercase;
1923         int chartype;
1924         if ((ims & PCRE_CASELESS) != 0 &&
1925              ucp_findchar(fc, &chartype, &othercase) >= 0 &&
1926              othercase > 0)
1927           oclength = _pcre_ord2utf8(othercase, occhars);
1928 #endif  /* SUPPORT_UCP */
1929
1930         for (i = 1; i <= min; i++)
1931           {
1932           if (memcmp(eptr, charptr, length) == 0) eptr += length;
1933           /* Need braces because of following else */
1934           else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
1935           else
1936             {
1937             if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
1938             eptr += oclength;
1939             }
1940           }
1941
1942         if (min == max) continue;
1943
1944         if (minimize)
1945           {
1946           for (fi = min;; fi++)
1947             {
1948             RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1949             if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1950             if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1951             if (memcmp(eptr, charptr, length) == 0) eptr += length;
1952             /* Need braces because of following else */
1953             else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
1954             else
1955               {
1956               if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
1957               eptr += oclength;
1958               }
1959             }
1960           /* Control never gets here */
1961           }
1962         else
1963           {
1964           pp = eptr;
1965           for (i = min; i < max; i++)
1966             {
1967             if (eptr > md->end_subject - length) break;
1968             if (memcmp(eptr, charptr, length) == 0) eptr += length;
1969             else if (oclength == 0) break;
1970             else
1971               {
1972               if (memcmp(eptr, occhars, oclength) != 0) break;
1973               eptr += oclength;
1974               }
1975             }
1976           while (eptr >= pp)
1977            {
1978            RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1979            if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1980            eptr -= length;
1981            }
1982           RRETURN(MATCH_NOMATCH);
1983           }
1984         /* Control never gets here */
1985         }
1986
1987       /* If the length of a UTF-8 character is 1, we fall through here, and
1988       obey the code as for non-UTF-8 characters below, though in this case the
1989       value of fc will always be < 128. */
1990       }
1991     else
1992 #endif  /* SUPPORT_UTF8 */
1993
1994     /* When not in UTF-8 mode, load a single-byte character. */
1995       {
1996       if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
1997       fc = *ecode++;
1998       }
1999
2000     /* The value of fc at this point is always less than 256, though we may or
2001     may not be in UTF-8 mode. The code is duplicated for the caseless and
2002     caseful cases, for speed, since matching characters is likely to be quite
2003     common. First, ensure the minimum number of matches are present. If min =
2004     max, continue at the same level without recursing. Otherwise, if
2005     minimizing, keep trying the rest of the expression and advancing one
2006     matching character if failing, up to the maximum. Alternatively, if
2007     maximizing, find the maximum number of characters and work backwards. */
2008
2009     DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2010       max, eptr));
2011
2012     if ((ims & PCRE_CASELESS) != 0)
2013       {
2014       fc = md->lcc[fc];
2015       for (i = 1; i <= min; i++)
2016         if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2017       if (min == max) continue;
2018       if (minimize)
2019         {
2020         for (fi = min;; fi++)
2021           {
2022           RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2023           if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2024           if (fi >= max || eptr >= md->end_subject ||
2025               fc != md->lcc[*eptr++])
2026             RRETURN(MATCH_NOMATCH);
2027           }
2028         /* Control never gets here */
2029         }
2030       else
2031         {
2032         pp = eptr;
2033         for (i = min; i < max; i++)
2034           {
2035           if (eptr >= md->end_subject || fc != md->lcc[*eptr]) break;
2036           eptr++;
2037           }
2038         while (eptr >= pp)
2039           {
2040           RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2041           eptr--;
2042           if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2043           }
2044         RRETURN(MATCH_NOMATCH);
2045         }
2046       /* Control never gets here */
2047       }
2048
2049     /* Caseful comparisons (includes all multi-byte characters) */
2050
2051     else
2052       {
2053       for (i = 1; i <= min; i++) if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
2054       if (min == max) continue;
2055       if (minimize)
2056         {
2057         for (fi = min;; fi++)
2058           {
2059           RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2060           if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2061           if (fi >= max || eptr >= md->end_subject || fc != *eptr++)
2062             RRETURN(MATCH_NOMATCH);
2063           }
2064         /* Control never gets here */
2065         }
2066       else
2067         {
2068         pp = eptr;
2069         for (i = min; i < max; i++)
2070           {
2071           if (eptr >= md->end_subject || fc != *eptr) break;
2072           eptr++;
2073           }
2074         while (eptr >= pp)
2075           {
2076           RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2077           eptr--;
2078           if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2079           }
2080         RRETURN(MATCH_NOMATCH);
2081         }
2082       }
2083     /* Control never gets here */
2084
2085     /* Match a negated single one-byte character. The pattern character is one
2086     byte (*ecode), but the subject character being checked can be multibyte. */
2087
2088     case OP_NOT:
2089     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2090     ecode++;                              /* Step over the opcode */
2091     GETCHARINCTEST(c, eptr);
2092     if ((ims & PCRE_CASELESS) != 0)
2093       {
2094 #ifdef SUPPORT_UTF8
2095       if (c < 256)                        /* Only values below 256 go through the lcc table */
2096 #endif
2097       c = md->lcc[c];
2098       if (md->lcc[*ecode++] == c) RRETURN(MATCH_NOMATCH);
2099       }
2100     else
2101       {
2102       if (*ecode++ == c) RRETURN(MATCH_NOMATCH);
2103       }
2104     break;
2105
2106     /* Match a negated single one-byte character repeatedly. This is almost a
2107     repeat of the code for a repeated single character, but I haven't found a
2108     nice way of commoning these up that doesn't require a test of the
2109     positive/negative option for each character match. Maybe that wouldn't add
2110     very much to the time taken, but character matching *is* what this is all
2111     about... */
2112
2113     case OP_NOTEXACT:
2114     min = max = GET2(ecode, 1);
2115     ecode += 3;
2116     goto REPEATNOTCHAR;
2117
2118     case OP_NOTUPTO:
2119     case OP_NOTMINUPTO:
2120     min = 0;
2121     max = GET2(ecode, 1);
2122     minimize = *ecode == OP_NOTMINUPTO;
2123     ecode += 3;
2124     goto REPEATNOTCHAR;
2125
2126     case OP_NOTSTAR:
2127     case OP_NOTMINSTAR:
2128     case OP_NOTPLUS:
2129     case OP_NOTMINPLUS:
2130     case OP_NOTQUERY:
2131     case OP_NOTMINQUERY:
2132     c = *ecode++ - OP_NOTSTAR;
2133     minimize = (c & 1) != 0;
2134     min = rep_min[c];                 /* Pick up values from tables; */
2135     max = rep_max[c];                 /* zero for max => infinity */
2136     if (max == 0) max = INT_MAX;
2137
2138     /* Common code for all repeated single-byte matches. We can give up quickly
2139     if there are fewer than the minimum number of bytes left in the
2140     subject. */
2141
2142     REPEATNOTCHAR:
2143     if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2144     fc = *ecode++;
2145
2146     /* The code is duplicated for the caseless and caseful cases, for speed,
2147     since matching characters is likely to be quite common. First, ensure the
2148     minimum number of matches are present. If min = max, continue at the same
2149     level without recursing. Otherwise, if minimizing, keep trying the rest of
2150     the expression and advancing one matching character if failing, up to the
2151     maximum. Alternatively, if maximizing, find the maximum number of
2152     characters and work backwards. */
2153
2154     DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2155       max, eptr));
2156
2157     if ((ims & PCRE_CASELESS) != 0)
2158       {
2159       fc = md->lcc[fc];
2160
2161 #ifdef SUPPORT_UTF8
2162       /* UTF-8 mode */
2163       if (utf8)
2164         {
2165         register int d;
2166         for (i = 1; i <= min; i++)
2167           {
2168           GETCHARINC(d, eptr);
2169           if (d < 256) d = md->lcc[d];
2170           if (fc == d) RRETURN(MATCH_NOMATCH);
2171           }
2172         }
2173       else
2174 #endif
2175
2176       /* Not UTF-8 mode */
2177         {
2178         for (i = 1; i <= min; i++)
2179           if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2180         }
2181
2182       if (min == max) continue;
2183
2184       if (minimize)
2185         {
2186 #ifdef SUPPORT_UTF8
2187         /* UTF-8 mode */
2188         if (utf8)
2189           {
2190           register int d;
2191           for (fi = min;; fi++)
2192             {
2193             RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2194             if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2195             GETCHARINC(d, eptr);
2196             if (d < 256) d = md->lcc[d];
2197             if (fi >= max || eptr >= md->end_subject || fc == d)
2198               RRETURN(MATCH_NOMATCH);
2199             }
2200           }
2201         else
2202 #endif
2203         /* Not UTF-8 mode */
2204           {
2205           for (fi = min;; fi++)
2206             {
2207             RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2208             if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2209             if (fi >= max || eptr >= md->end_subject || fc == md->lcc[*eptr++])
2210               RRETURN(MATCH_NOMATCH);
2211             }
2212           }
2213         /* Control never gets here */
2214         }
2215
2216       /* Maximize case */
2217
2218       else
2219         {
2220         pp = eptr;
2221
2222 #ifdef SUPPORT_UTF8
2223         /* UTF-8 mode */
2224         if (utf8)
2225           {
2226           register int d;
2227           for (i = min; i < max; i++)
2228             {
2229             int len = 1;
2230             if (eptr >= md->end_subject) break;
2231             GETCHARLEN(d, eptr, len);
2232             if (d < 256) d = md->lcc[d];
2233             if (fc == d) break;
2234             eptr += len;
2235             }
2236           for(;;)
2237             {
2238             RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2239             if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2240             if (eptr-- == pp) break;        /* Stop if tried at original pos */
2241             BACKCHAR(eptr);
2242             }
2243           }
2244         else
2245 #endif
2246         /* Not UTF-8 mode */
2247           {
2248           for (i = min; i < max; i++)
2249             {
2250             if (eptr >= md->end_subject || fc == md->lcc[*eptr]) break;
2251             eptr++;
2252             }
2253           while (eptr >= pp)
2254             {
2255             RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2256             if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2257             eptr--;
2258             }
2259           }
2260
2261         RRETURN(MATCH_NOMATCH);
2262         }
2263       /* Control never gets here */
2264       }
2265
2266     /* Caseful comparisons */
2267
2268     else
2269       {
2270 #ifdef SUPPORT_UTF8
2271       /* UTF-8 mode */
2272       if (utf8)
2273         {
2274         register int d;
2275         for (i = 1; i <= min; i++)
2276           {
2277           GETCHARINC(d, eptr);
2278           if (fc == d) RRETURN(MATCH_NOMATCH);
2279           }
2280         }
2281       else
2282 #endif
2283       /* Not UTF-8 mode */
2284         {
2285         for (i = 1; i <= min; i++)
2286           if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
2287         }
2288
2289       if (min == max) continue;
2290
2291       if (minimize)
2292         {
2293 #ifdef SUPPORT_UTF8
2294         /* UTF-8 mode */
2295         if (utf8)
2296           {
2297           register int d;
2298           for (fi = min;; fi++)
2299             {
2300             RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2301             if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2302             GETCHARINC(d, eptr);
2303             if (fi >= max || eptr >= md->end_subject || fc == d)
2304               RRETURN(MATCH_NOMATCH);
2305             }
2306           }
2307         else
2308 #endif
2309         /* Not UTF-8 mode */
2310           {
2311           for (fi = min;; fi++)
2312             {
2313             RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2314             if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2315             if (fi >= max || eptr >= md->end_subject || fc == *eptr++)
2316               RRETURN(MATCH_NOMATCH);
2317             }
2318           }
2319         /* Control never gets here */
2320         }
2321
2322       /* Maximize case */
2323
2324       else
2325         {
2326         pp = eptr;
2327
2328 #ifdef SUPPORT_UTF8
2329         /* UTF-8 mode */
2330         if (utf8)
2331           {
2332           register int d;
2333           for (i = min; i < max; i++)
2334             {
2335             int len = 1;
2336             if (eptr >= md->end_subject) break;
2337             GETCHARLEN(d, eptr, len);
2338             if (fc == d) break;
2339             eptr += len;
2340             }
2341           for(;;)
2342             {
2343             RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2344             if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2345             if (eptr-- == pp) break;        /* Stop if tried at original pos */
2346             BACKCHAR(eptr);
2347             }
2348           }
2349         else
2350 #endif
2351         /* Not UTF-8 mode */
2352           {
2353           for (i = min; i < max; i++)
2354             {
2355             if (eptr >= md->end_subject || fc == *eptr) break;
2356             eptr++;
2357             }
2358           while (eptr >= pp)
2359             {
2360             RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2361             if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2362             eptr--;
2363             }
2364           }
2365
2366         RRETURN(MATCH_NOMATCH);
2367         }
2368       }
2369     /* Control never gets here */
2370
2371     /* Match a single character type repeatedly; several different opcodes
2372     share code. This is very similar to the code for single characters, but we
2373     repeat it in the interests of efficiency. */
2374
2375     case OP_TYPEEXACT:
2376     min = max = GET2(ecode, 1);
2377     minimize = TRUE;
2378     ecode += 3;
2379     goto REPEATTYPE;
2380
2381     case OP_TYPEUPTO:
2382     case OP_TYPEMINUPTO:
2383     min = 0;
2384     max = GET2(ecode, 1);
2385     minimize = *ecode == OP_TYPEMINUPTO;
2386     ecode += 3;
2387     goto REPEATTYPE;
2388
2389     case OP_TYPESTAR:
2390     case OP_TYPEMINSTAR:
2391     case OP_TYPEPLUS:
2392     case OP_TYPEMINPLUS:
2393     case OP_TYPEQUERY:
2394     case OP_TYPEMINQUERY:
2395     c = *ecode++ - OP_TYPESTAR;
2396     minimize = (c & 1) != 0;
2397     min = rep_min[c];                 /* Pick up values from tables; */
2398     max = rep_max[c];                 /* zero for max => infinity */
2399     if (max == 0) max = INT_MAX;
2400
2401     /* Common code for all repeated single character type matches. Note that
2402     in UTF-8 mode, '.' matches a character of any length, but for the other
2403     character types, the valid characters are all one-byte long. */
2404
2405     REPEATTYPE:
2406     ctype = *ecode++;      /* Code for the character type */
2407
2408 #ifdef SUPPORT_UCP
2409     if (ctype == OP_PROP || ctype == OP_NOTPROP)
2410       {
2411       prop_fail_result = ctype == OP_NOTPROP;
2412       prop_type = *ecode++;
2413       if (prop_type >= 128)
2414         {
2415         prop_test_against = prop_type - 128;
2416         prop_test_variable = &prop_category;
2417         }
2418       else
2419         {
2420         prop_test_against = prop_type;
2421         prop_test_variable = &prop_chartype;
2422         }
2423       }
2424     else prop_type = -1;
2425 #endif
2426
2427     /* First, ensure the minimum number of matches are present. Use inline
2428     code for maximizing the speed, and do the type test once at the start
2429     (i.e. keep it out of the loop). Also we can test that there are at least
2430     the minimum number of bytes before we start. This isn't as effective in
2431     UTF-8 mode, but it does no harm. Separate the UTF-8 code completely as that
2432     is tidier. Also separate the UCP code, which can be the same for both UTF-8
2433     and single-bytes. */
2434
2435     if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2436     if (min > 0)
2437       {
2438 #ifdef SUPPORT_UCP
2439       if (prop_type > 0)
2440         {
2441         for (i = 1; i <= min; i++)
2442           {
2443           GETCHARINC(c, eptr);
2444           prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
2445           if ((*prop_test_variable == prop_test_against) == prop_fail_result)
2446             RRETURN(MATCH_NOMATCH);
2447           }
2448         }
2449
2450       /* Match extended Unicode sequences. We will get here only if the
2451       support is in the binary; otherwise a compile-time error occurs. */
2452
2453       else if (ctype == OP_EXTUNI)
2454         {
2455         for (i = 1; i <= min; i++)
2456           {
2457           GETCHARINCTEST(c, eptr);
2458           prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
2459           if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
2460           while (eptr < md->end_subject)
2461             {
2462             int len = 1;
2463             if (!utf8) c = *eptr; else
2464               {
2465               GETCHARLEN(c, eptr, len);
2466               }
2467             prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
2468             if (prop_category != ucp_M) break;
2469             eptr += len;
2470             }
2471           }
2472         }
2473
2474       else
2475 #endif     /* SUPPORT_UCP */
2476
2477 /* Handle all other cases when the coding is UTF-8 */
2478
2479 #ifdef SUPPORT_UTF8
2480       if (utf8) switch(ctype)        /* Consume the minimum count, UTF-8 mode */
2481         {
2482         case OP_ANY:
2483         for (i = 1; i <= min; i++)
2484           {
2485           if (eptr >= md->end_subject ||
2486              (*eptr++ == NEWLINE && (ims & PCRE_DOTALL) == 0))
2487             RRETURN(MATCH_NOMATCH);
2488           while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2489           }
2490         break;
2491
2492         case OP_ANYBYTE:
2493         eptr += min;
2494         break;
2495
2496         case OP_NOT_DIGIT:
2497         for (i = 1; i <= min; i++)
2498           {
2499           if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2500           GETCHARINC(c, eptr);
2501           if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
2502             RRETURN(MATCH_NOMATCH);
2503           }
2504         break;
2505
2506         case OP_DIGIT:
2507         for (i = 1; i <= min; i++)
2508           {
2509           if (eptr >= md->end_subject ||
2510              *eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
2511             RRETURN(MATCH_NOMATCH);
2512           /* No need to skip more bytes - we know it's a 1-byte character */
2513           }
2514         break;
2515
2516         case OP_NOT_WHITESPACE:
2517         for (i = 1; i <= min; i++)
2518           {
2519           if (eptr >= md->end_subject ||
2520              (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0))
2521             RRETURN(MATCH_NOMATCH);
2522           while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);  /* Always pass the first byte, then any 10xxxxxx bytes */
2523           }
2524         break;
2525
2526         case OP_WHITESPACE:
2527         for (i = 1; i <= min; i++)
2528           {
2529           if (eptr >= md->end_subject ||
2530              *eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
2531             RRETURN(MATCH_NOMATCH);
2532           /* No need to skip more bytes - we know it's a 1-byte character */
2533           }
2534         break;
2535
2536         case OP_NOT_WORDCHAR:
2537         for (i = 1; i <= min; i++)
2538           {
2539           if (eptr >= md->end_subject ||
2540              (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0))
2541             RRETURN(MATCH_NOMATCH);
2542           while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);  /* Always pass the first byte, then any 10xxxxxx bytes */
2543           }
2544         break;
2545
2546         case OP_WORDCHAR:
2547         for (i = 1; i <= min; i++)
2548           {
2549           if (eptr >= md->end_subject ||
2550              *eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
2551             RRETURN(MATCH_NOMATCH);
2552           /* No need to skip more bytes - we know it's a 1-byte character */
2553           }
2554         break;
2555
2556         default:
2557         RRETURN(PCRE_ERROR_INTERNAL);
2558         }  /* End switch(ctype) */
2559
2560       else
2561 #endif     /* SUPPORT_UTF8 */
2562
2563       /* Code for the non-UTF-8 case for minimum matching of operators other
2564       than OP_PROP and OP_NOTPROP. */
2565
2566       switch(ctype)
2567         {
2568         case OP_ANY:
2569         if ((ims & PCRE_DOTALL) == 0)
2570           {
2571           for (i = 1; i <= min; i++)
2572             if (*eptr++ == NEWLINE) RRETURN(MATCH_NOMATCH);
2573           }
2574         else eptr += min;
2575         break;
2576
2577         case OP_ANYBYTE:
2578         eptr += min;
2579         break;
2580
2581         case OP_NOT_DIGIT:
2582         for (i = 1; i <= min; i++)
2583           if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
2584         break;
2585
2586         case OP_DIGIT:
2587         for (i = 1; i <= min; i++)
2588           if ((md->ctypes[*eptr++] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
2589         break;
2590
2591         case OP_NOT_WHITESPACE:
2592         for (i = 1; i <= min; i++)
2593           if ((md->ctypes[*eptr++] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
2594         break;
2595
2596         case OP_WHITESPACE:
2597         for (i = 1; i <= min; i++)
2598           if ((md->ctypes[*eptr++] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
2599         break;
2600
2601         case OP_NOT_WORDCHAR:
2602         for (i = 1; i <= min; i++)
2603           if ((md->ctypes[*eptr++] & ctype_word) != 0)
2604             RRETURN(MATCH_NOMATCH);
2605         break;
2606
2607         case OP_WORDCHAR:
2608         for (i = 1; i <= min; i++)
2609           if ((md->ctypes[*eptr++] & ctype_word) == 0)
2610             RRETURN(MATCH_NOMATCH);
2611         break;
2612
2613         default:
2614         RRETURN(PCRE_ERROR_INTERNAL);
2615         }
2616       }
2617
2618     /* If min = max, continue at the same level without recursing */
2619
2620     if (min == max) continue;
2621
2622     /* If minimizing, we have to test the rest of the pattern before each
2623     subsequent match. Again, separate the UTF-8 case for speed, and also
2624     separate the UCP cases. */
2625
2626     if (minimize)
2627       {
2628 #ifdef SUPPORT_UCP
2629       if (prop_type > 0)
2630         {
2631         for (fi = min;; fi++)
2632           {
2633           RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2634           if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2635           if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2636           GETCHARINC(c, eptr);
2637           prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
2638           if ((*prop_test_variable == prop_test_against) == prop_fail_result)
2639             RRETURN(MATCH_NOMATCH);
2640           }
2641         }
2642
2643       /* Match extended Unicode sequences. We will get here only if the
2644       support is in the binary; otherwise a compile-time error occurs. */
2645
2646       else if (ctype == OP_EXTUNI)
2647         {
2648         for (fi = min;; fi++)
2649           {
2650           RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2651           if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2652           if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2653           GETCHARINCTEST(c, eptr);
2654           prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
2655           if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
2656           while (eptr < md->end_subject)
2657             {
2658             int len = 1;
2659             if (!utf8) c = *eptr; else
2660               {
2661               GETCHARLEN(c, eptr, len);
2662               }
2663             prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
2664             if (prop_category != ucp_M) break;
2665             eptr += len;
2666             }
2667           }
2668         }
2669
2670       else
2671 #endif     /* SUPPORT_UCP */
2672
2673 #ifdef SUPPORT_UTF8
2674       /* UTF-8 mode */
2675       if (utf8)
2676         {
2677         for (fi = min;; fi++)
2678           {
2679           RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2680           if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2681           if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2682
2683           GETCHARINC(c, eptr);
2684           switch(ctype)
2685             {
2686             case OP_ANY:
2687             if ((ims & PCRE_DOTALL) == 0 && c == NEWLINE) RRETURN(MATCH_NOMATCH);
2688             break;
2689
2690             case OP_ANYBYTE:
2691             break;
2692
2693             case OP_NOT_DIGIT:
2694             if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
2695               RRETURN(MATCH_NOMATCH);
2696             break;
2697
2698             case OP_DIGIT:
2699             if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
2700               RRETURN(MATCH_NOMATCH);
2701             break;
2702
2703             case OP_NOT_WHITESPACE:
2704             if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
2705               RRETURN(MATCH_NOMATCH);
2706             break;
2707
2708             case OP_WHITESPACE:
2709             if  (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
2710               RRETURN(MATCH_NOMATCH);
2711             break;
2712
2713             case OP_NOT_WORDCHAR:
2714             if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
2715               RRETURN(MATCH_NOMATCH);
2716             break;
2717
2718             case OP_WORDCHAR:
2719             if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
2720               RRETURN(MATCH_NOMATCH);
2721             break;
2722
2723             default:
2724             RRETURN(PCRE_ERROR_INTERNAL);
2725             }
2726           }
2727         }
2728       else
2729 #endif
2730       /* Not UTF-8 mode */
2731         {
2732         for (fi = min;; fi++)
2733           {
2734           RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2735           if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2736           if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2737           c = *eptr++;
2738           switch(ctype)
2739             {
2740             case OP_ANY:
2741             if ((ims & PCRE_DOTALL) == 0 && c == NEWLINE) RRETURN(MATCH_NOMATCH);
2742             break;
2743
2744             case OP_ANYBYTE:
2745             break;
2746
2747             case OP_NOT_DIGIT:
2748             if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
2749             break;
2750
2751             case OP_DIGIT:
2752             if ((md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
2753             break;
2754
2755             case OP_NOT_WHITESPACE:
2756             if ((md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
2757             break;
2758
2759             case OP_WHITESPACE:
2760             if  ((md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
2761             break;
2762
2763             case OP_NOT_WORDCHAR:
2764             if ((md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
2765             break;
2766
2767             case OP_WORDCHAR:
2768             if ((md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
2769             break;
2770
2771             default:
2772             RRETURN(PCRE_ERROR_INTERNAL);
2773             }
2774           }
2775         }
2776       /* Control never gets here */
2777       }
2778
2779     /* If maximizing it is worth using inline code for speed, doing the type
2780     test once at the start (i.e. keep it out of the loop). Again, keep the
2781     UTF-8 and UCP stuff separate. */
2782
2783     else
2784       {
2785       pp = eptr;  /* Remember where we started */
2786
2787 #ifdef SUPPORT_UCP
2788       if (prop_type > 0)
2789         {
2790         for (i = min; i < max; i++)
2791           {
2792           int len = 1;
2793           if (eptr >= md->end_subject) break;
2794           GETCHARLEN(c, eptr, len);
2795           prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
2796           if ((*prop_test_variable == prop_test_against) == prop_fail_result)
2797             break;
2798           eptr+= len;
2799           }
2800
2801         /* eptr is now past the end of the maximum run */
2802
2803         for(;;)
2804           {
2805           RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2806           if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2807           if (eptr-- == pp) break;        /* Stop if tried at original pos */
2808           BACKCHAR(eptr);
2809           }
2810         }
2811
2812       /* Match extended Unicode sequences. We will get here only if the
2813       support is in the binary; otherwise a compile-time error occurs. */
2814
2815       else if (ctype == OP_EXTUNI)
2816         {
2817         for (i = min; i < max; i++)
2818           {
2819           if (eptr >= md->end_subject) break;
2820           GETCHARINCTEST(c, eptr);
2821           prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
2822           if (prop_category == ucp_M) break;
2823           while (eptr < md->end_subject)
2824             {
2825             int len = 1;
2826             if (!utf8) c = *eptr; else
2827               {
2828               GETCHARLEN(c, eptr, len);
2829               }
2830             prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
2831             if (prop_category != ucp_M) break;
2832             eptr += len;
2833             }
2834           }
2835
2836         /* eptr is now past the end of the maximum run */
2837
2838         for(;;)
2839           {
2840           RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2841           if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2842           if (eptr-- == pp) break;        /* Stop if tried at original pos */
2843           for (;;)                        /* Move back over one extended */
2844             {
2845             int len = 1;
2846             BACKCHAR(eptr);
2847             if (!utf8) c = *eptr; else
2848               {
2849               GETCHARLEN(c, eptr, len);
2850               }
2851             prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
2852             if (prop_category != ucp_M) break;
2853             eptr--;
2854             }
2855           }
2856         }
2857
2858       else
2859 #endif   /* SUPPORT_UCP */
2860
2861 #ifdef SUPPORT_UTF8
2862       /* UTF-8 mode */
2863
2864       if (utf8)
2865         {
2866         switch(ctype)
2867           {
2868           case OP_ANY:
2869
2870           /* Special code is required for UTF8, but when the maximum is unlimited
2871           we don't need it, so we repeat the non-UTF8 code. This is probably
2872           worth it, because .* is quite a common idiom. */
2873
2874           if (max < INT_MAX)
2875             {
2876             if ((ims & PCRE_DOTALL) == 0)
2877               {
2878               for (i = min; i < max; i++)
2879                 {
2880                 if (eptr >= md->end_subject || *eptr == NEWLINE) break;
2881                 eptr++;
2882                 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2883                 }
2884               }
2885             else
2886               {
2887               for (i = min; i < max; i++)
2888                 {
2889                 eptr++;
2890                 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2891                 }
2892               }
2893             }
2894
2895           /* Handle unlimited UTF-8 repeat */
2896
2897           else
2898             {
2899             if ((ims & PCRE_DOTALL) == 0)
2900               {
2901               for (i = min; i < max; i++)
2902                 {
2903                 if (eptr >= md->end_subject || *eptr == NEWLINE) break;
2904                 eptr++;
2905                 }
2906               break;
2907               }
2908             else
2909               {
2910               c = max - min;
2911               if (c > md->end_subject - eptr) c = md->end_subject - eptr;
2912               eptr += c;
2913               }
2914             }
2915           break;
2916
2917           /* The byte case is the same as non-UTF8 */
2918
2919           case OP_ANYBYTE:
2920           c = max - min;
2921           if (c > md->end_subject - eptr) c = md->end_subject - eptr;
2922           eptr += c;
2923           break;
2924
2925           case OP_NOT_DIGIT:
2926           for (i = min; i < max; i++)
2927             {
2928             int len = 1;
2929             if (eptr >= md->end_subject) break;
2930             GETCHARLEN(c, eptr, len);
2931             if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
2932             eptr+= len;
2933             }
2934           break;
2935
2936           case OP_DIGIT:
2937           for (i = min; i < max; i++)
2938             {
2939             int len = 1;
2940             if (eptr >= md->end_subject) break;
2941             GETCHARLEN(c, eptr, len);
2942             if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
2943             eptr+= len;
2944             }
2945           break;
2946
2947           case OP_NOT_WHITESPACE:
2948           for (i = min; i < max; i++)
2949             {
2950             int len = 1;
2951             if (eptr >= md->end_subject) break;
2952             GETCHARLEN(c, eptr, len);
2953             if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
2954             eptr+= len;
2955             }
2956           break;
2957
2958           case OP_WHITESPACE:
2959           for (i = min; i < max; i++)
2960             {
2961             int len = 1;
2962             if (eptr >= md->end_subject) break;
2963             GETCHARLEN(c, eptr, len);
2964             if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
2965             eptr+= len;
2966             }
2967           break;
2968
2969           case OP_NOT_WORDCHAR:
2970           for (i = min; i < max; i++)
2971             {
2972             int len = 1;
2973             if (eptr >= md->end_subject) break;
2974             GETCHARLEN(c, eptr, len);
2975             if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
2976             eptr+= len;
2977             }
2978           break;
2979
2980           case OP_WORDCHAR:
2981           for (i = min; i < max; i++)
2982             {
2983             int len = 1;
2984             if (eptr >= md->end_subject) break;
2985             GETCHARLEN(c, eptr, len);
2986             if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
2987             eptr+= len;
2988             }
2989           break;
2990
2991           default:
2992           RRETURN(PCRE_ERROR_INTERNAL);
2993           }
2994
2995         /* eptr is now past the end of the maximum run */
2996
2997         for(;;)
2998           {
2999           RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
3000           if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3001           if (eptr-- == pp) break;        /* Stop if tried at original pos */
3002           BACKCHAR(eptr);
3003           }
3004         }
3005       else
3006 #endif
3007
3008       /* Not UTF-8 mode */
3009         {
3010         switch(ctype)
3011           {
3012           case OP_ANY:
3013           if ((ims & PCRE_DOTALL) == 0)
3014             {
3015             for (i = min; i < max; i++)
3016               {
3017               if (eptr >= md->end_subject || *eptr == NEWLINE) break;
3018               eptr++;
3019               }
3020             break;
3021             }
3022           /* For DOTALL case, fall through and treat as \C */
3023
3024           case OP_ANYBYTE:
3025           c = max - min;
3026           if (c > md->end_subject - eptr) c = md->end_subject - eptr;
3027           eptr += c;
3028           break;
3029
3030           case OP_NOT_DIGIT:
3031           for (i = min; i < max; i++)
3032             {
3033             if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0)
3034               break;
3035             eptr++;
3036             }
3037           break;
3038
3039           case OP_DIGIT:
3040           for (i = min; i < max; i++)
3041             {
3042             if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) == 0)
3043               break;
3044             eptr++;
3045             }
3046           break;
3047
3048           case OP_NOT_WHITESPACE:
3049           for (i = min; i < max; i++)
3050             {
3051             if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) != 0)
3052               break;
3053             eptr++;
3054             }
3055           break;
3056
3057           case OP_WHITESPACE:
3058           for (i = min; i < max; i++)
3059             {
3060             if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) == 0)
3061               break;
3062             eptr++;
3063             }
3064           break;
3065
3066           case OP_NOT_WORDCHAR:
3067           for (i = min; i < max; i++)
3068             {
3069             if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) != 0)
3070               break;
3071             eptr++;
3072             }
3073           break;
3074
3075           case OP_WORDCHAR:
3076           for (i = min; i < max; i++)
3077             {
3078             if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) == 0)
3079               break;
3080             eptr++;
3081             }
3082           break;
3083
3084           default:
3085           RRETURN(PCRE_ERROR_INTERNAL);
3086           }
3087
3088         /* eptr is now past the end of the maximum run */
3089
3090         while (eptr >= pp)
3091           {
3092           RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
3093           eptr--;
3094           if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3095           }
3096         }
3097
3098       /* Get here if we can't make it match with any permitted repetitions */
3099
3100       RRETURN(MATCH_NOMATCH);
3101       }
3102     /* Control never gets here */
3103
3104     /* There's been some horrible disaster. Since all codes > OP_BRA are
3105     for capturing brackets, and there shouldn't be any gaps between 0 and
3106     OP_BRA, arrival here can only mean there is something seriously wrong
3107     in the code above or the OP_xxx definitions. */
3108
3109     default:
3110     DPRINTF(("Unknown opcode %d\n", *ecode));
3111     RRETURN(PCRE_ERROR_UNKNOWN_NODE);
3112     }
3113
3114   /* Do not stick any code in here without much thought; it is assumed
3115   that "continue" in the code above comes out to here to repeat the main
3116   loop. */
3117
3118   }             /* End of main loop */
3119 /* Control never reaches here */
3120 }
3121
3122
3123 /***************************************************************************
3124 ****************************************************************************
3125                    RECURSION IN THE match() FUNCTION
3126
3127 Undefine all the macros that were defined above to handle this. */
3128
3129 #ifdef NO_RECURSE
3130 #undef eptr
3131 #undef ecode
3132 #undef offset_top
3133 #undef ims
3134 #undef eptrb
3135 #undef flags
3136
3137 #undef callpat
3138 #undef charptr
3139 #undef data
3140 #undef next
3141 #undef pp
3142 #undef prev
3143 #undef saved_eptr
3144
3145 #undef new_recursive
3146
3147 #undef cur_is_word
3148 #undef condition
3149 #undef minimize
3150 #undef prev_is_word
3151
3152 #undef original_ims
3153
3154 #undef ctype
3155 #undef length
3156 #undef max
3157 #undef min
3158 #undef number
3159 #undef offset
3160 #undef op
3161 #undef save_capture_last
3162 #undef save_offset1
3163 #undef save_offset2
3164 #undef save_offset3
3165 #undef stacksave
3166
3167 #undef newptrb
3168
3169 #endif
3170
3171 /* These two are defined as macros in both cases */
3172
3173 #undef fc
3174 #undef fi
3175
3176 /***************************************************************************
3177 ***************************************************************************/
3178
3179
3180
3181 /*************************************************
3182 *         Execute a Regular Expression           *
3183 *************************************************/
3184
3185 /* This function applies a compiled re to a subject string and picks out
3186 portions of the string if it matches. Two elements in the vector are set for
3187 each substring: the offsets to the start and end of the substring.
3188
3189 Arguments:
3190   argument_re     points to the compiled expression
3191   extra_data      points to extra data or is NULL
3192   subject         points to the subject string
3193   length          length of subject string (may contain binary zeros)
3194   start_offset    where to start in the subject string
3195   options         option bits
3196   offsets         points to a vector of ints to be filled in with offsets
3197   offsetcount     the number of elements in the vector
3198
3199 Returns:          > 0 => success; value is the number of elements filled in
3200                   = 0 => success, but offsets is not big enough
3201                    -1 => failed to match
3202                  < -1 => some kind of unexpected problem
3203 */
3204
3205 EXPORT int
3206 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
3207   const char *subject, int length, int start_offset, int options, int *offsets,
3208   int offsetcount)
3209 {
3210 int rc, resetcount, ocount;
3211 int first_byte = -1;
3212 int req_byte = -1;
3213 int req_byte2 = -1;
3214 unsigned long int ims = 0;
3215 BOOL using_temporary_offsets = FALSE;
3216 BOOL anchored;
3217 BOOL startline;
3218 BOOL firstline;
3219 BOOL first_byte_caseless = FALSE;
3220 BOOL req_byte_caseless = FALSE;
3221 match_data match_block;
3222 const uschar *tables;
3223 const uschar *start_bits = NULL;
3224 const uschar *start_match = (const uschar *)subject + start_offset;
3225 const uschar *end_subject;
3226 const uschar *req_byte_ptr = start_match - 1;
3227
3228 pcre_study_data internal_study;
3229 const pcre_study_data *study;
3230
3231 real_pcre internal_re;
3232 const real_pcre *external_re = (const real_pcre *)argument_re;
3233 const real_pcre *re = external_re;
3234
3235 /* Plausibility checks */
3236
3237 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
3238 if (re == NULL || subject == NULL ||
3239    (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
3240 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
3241
3242 /* Fish out the optional data from the extra_data structure, first setting
3243 the default values. */
3244
3245 study = NULL;
3246 match_block.match_limit = MATCH_LIMIT;
3247 match_block.callout_data = NULL;
3248
3249 /* The table pointer is always in native byte order. */
3250
3251 tables = external_re->tables;
3252
3253 if (extra_data != NULL)
3254   {
3255   register unsigned int flags = extra_data->flags;
3256   if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
3257     study = (const pcre_study_data *)extra_data->study_data;
3258   if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
3259     match_block.match_limit = extra_data->match_limit;
3260   if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
3261     match_block.callout_data = extra_data->callout_data;
3262   if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
3263   }
3264
3265 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
3266 is a feature that makes it possible to save compiled regex and re-use them
3267 in other programs later. */
3268
3269 if (tables == NULL) tables = _pcre_default_tables;
3270
3271 /* Check that the first field in the block is the magic number. If it is not,
3272 test for a regex that was compiled on a host of opposite endianness. If this is
3273 the case, flipped values are put in internal_re and internal_study if there was
3274 study data too. */
3275
3276 if (re->magic_number != MAGIC_NUMBER)
3277   {
3278   re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
3279   if (re == NULL) return PCRE_ERROR_BADMAGIC;
3280   if (study != NULL) study = &internal_study;
3281   }
3282
3283 /* Set up other data */
3284
3285 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
3286 startline = (re->options & PCRE_STARTLINE) != 0;
3287 firstline = (re->options & PCRE_FIRSTLINE) != 0;
3288
3289 /* The code starts after the real_pcre block and the capture name table. */
3290
3291 match_block.start_code = (const uschar *)external_re + re->name_table_offset +
3292   re->name_count * re->name_entry_size;
3293
3294 match_block.start_subject = (const uschar *)subject;
3295 match_block.start_offset = start_offset;
3296 match_block.end_subject = match_block.start_subject + length;
3297 end_subject = match_block.end_subject;
3298
3299 match_block.endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
3300 match_block.utf8 = (re->options & PCRE_UTF8) != 0;
3301
3302 match_block.notbol = (options & PCRE_NOTBOL) != 0;
3303 match_block.noteol = (options & PCRE_NOTEOL) != 0;
3304 match_block.notempty = (options & PCRE_NOTEMPTY) != 0;
3305 match_block.partial = (options & PCRE_PARTIAL) != 0;
3306 match_block.hitend = FALSE;
3307
3308 match_block.recursive = NULL;                   /* No recursion at top level */
3309
3310 match_block.lcc = tables + lcc_offset;
3311 match_block.ctypes = tables + ctypes_offset;
3312
3313 /* Partial matching is supported only for a restricted set of regexes at the
3314 moment. */
3315
3316 if (match_block.partial && (re->options & PCRE_NOPARTIAL) != 0)
3317   return PCRE_ERROR_BADPARTIAL;
3318
3319 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
3320 back the character offset. */
3321
3322 #ifdef SUPPORT_UTF8
3323 if (match_block.utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
3324   {
3325   if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
3326     return PCRE_ERROR_BADUTF8;
3327   if (start_offset > 0 && start_offset < length)
3328     {
3329     int tb = ((uschar *)subject)[start_offset];
3330     if (tb > 127)
3331       {
3332       tb &= 0xc0;
3333       if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
3334       }
3335     }
3336   }
3337 #endif
3338
3339 /* The ims options can vary during the matching as a result of the presence
3340 of (?ims) items in the pattern. They are kept in a local variable so that
3341 restoring at the exit of a group is easy. */
3342
3343 ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
3344
3345 /* If the expression has got more back references than the offsets supplied can
3346 hold, we get a temporary chunk of working store to use during the matching.
3347 Otherwise, we can use the vector supplied, rounding down its size to a multiple
3348 of 3. */
3349
3350 ocount = offsetcount - (offsetcount % 3);
3351
3352 if (re->top_backref > 0 && re->top_backref >= ocount/3)
3353   {
3354   ocount = re->top_backref * 3 + 3;
3355   match_block.offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
3356   if (match_block.offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
3357   using_temporary_offsets = TRUE;
3358   DPRINTF(("Got memory to hold back references\n"));
3359   }
3360 else match_block.offset_vector = offsets;
3361
3362 match_block.offset_end = ocount;
3363 match_block.offset_max = (2*ocount)/3;
3364 match_block.offset_overflow = FALSE;
3365 match_block.capture_last = -1;
3366
3367 /* Compute the minimum number of offsets that we need to reset each time. Doing
3368 this makes a huge difference to execution time when there aren't many brackets
3369 in the pattern. */
3370
3371 resetcount = 2 + re->top_bracket * 2;
3372 if (resetcount > offsetcount) resetcount = ocount;
3373
3374 /* Reset the working variable associated with each extraction. These should
3375 never be used unless previously set, but they get saved and restored, and so we
3376 initialize them to avoid reading uninitialized locations. */
3377
3378 if (match_block.offset_vector != NULL)
3379   {
3380   register int *iptr = match_block.offset_vector + ocount;
3381   register int *iend = iptr - resetcount/2 + 1;
3382   while (--iptr >= iend) *iptr = -1;
3383   }
3384
3385 /* Set up the first character to match, if available. The first_byte value is
3386 never set for an anchored regular expression, but the anchoring may be forced
3387 at run time, so we have to test for anchoring. The first char may be unset for
3388 an unanchored pattern, of course. If there's no first char and the pattern was
3389 studied, there may be a bitmap of possible first characters. */
3390
3391 if (!anchored)
3392   {
3393   if ((re->options & PCRE_FIRSTSET) != 0)
3394     {
3395     first_byte = re->first_byte & 255;
3396     if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
3397       first_byte = match_block.lcc[first_byte];
3398     }
3399   else
3400     if (!startline && study != NULL &&
3401       (study->options & PCRE_STUDY_MAPPED) != 0)
3402         start_bits = study->start_bits;
3403   }
3404
3405 /* For anchored or unanchored matches, there may be a "last known required
3406 character" set. */
3407
3408 if ((re->options & PCRE_REQCHSET) != 0)
3409   {
3410   req_byte = re->req_byte & 255;
3411   req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
3412   req_byte2 = (tables + fcc_offset)[req_byte];  /* case flipped */
3413   }
3414
3415 /* Loop for handling unanchored repeated matching attempts; for anchored regexs
3416 the loop runs just once. */
3417
3418 do
3419   {
3420   const uschar *save_end_subject = end_subject;
3421
3422   /* Reset the maximum number of extractions we might see. */
3423
3424   if (match_block.offset_vector != NULL)
3425     {
3426     register int *iptr = match_block.offset_vector;
3427     register int *iend = iptr + resetcount;
3428     while (iptr < iend) *iptr++ = -1;
3429     }
3430
3431   /* Advance to a unique first char if possible. If firstline is TRUE, the
3432   start of the match is constrained to the first line of a multiline string.
3433   Implement this by temporarily adjusting end_subject so that we stop scanning
3434   at a newline. If the match fails at the newline, later code breaks this loop.
3435   */
3436
3437   if (firstline)
3438     {
3439     const uschar *t = start_match;
3440     while (t < save_end_subject && *t != '\n') t++;
3441     end_subject = t;
3442     }
3443
3444   /* Now test for a unique first byte */
3445
3446   if (first_byte >= 0)
3447     {
3448     if (first_byte_caseless)
3449       while (start_match < end_subject &&
3450              match_block.lcc[*start_match] != first_byte)
3451         start_match++;
3452     else
3453       while (start_match < end_subject && *start_match != first_byte)
3454         start_match++;
3455     }
3456
3457   /* Or to just after \n for a multiline match if possible */
3458
3459   else if (startline)
3460     {
3461     if (start_match > match_block.start_subject + start_offset)
3462       {
3463       while (start_match < end_subject && start_match[-1] != NEWLINE)
3464         start_match++;
3465       }
3466     }
3467
3468   /* Or to a non-unique first char after study */
3469
3470   else if (start_bits != NULL)
3471     {
3472     while (start_match < end_subject)
3473       {
3474       register unsigned int c = *start_match;
3475       if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; else break;
3476       }
3477     }
3478
3479   /* Restore fudged end_subject */
3480
3481   end_subject = save_end_subject;
3482
3483 #ifdef DEBUG  /* Sigh. Some compilers never learn. */
3484   printf(">>>> Match against: ");
3485   pchars(start_match, end_subject - start_match, TRUE, &match_block);
3486   printf("\n");
3487 #endif
3488
3489   /* If req_byte is set, we know that that character must appear in the subject
3490   for the match to succeed. If the first character is set, req_byte must be
3491   later in the subject; otherwise the test starts at the match point. This
3492   optimization can save a huge amount of backtracking in patterns with nested
3493   unlimited repeats that aren't going to match. Writing separate code for
3494   cased/caseless versions makes it go faster, as does using an autoincrement
3495   and backing off on a match.
3496
3497   HOWEVER: when the subject string is very, very long, searching to its end can
3498   take a long time, and give bad performance on quite ordinary patterns. This
3499   showed up when somebody was matching /^C/ on a 32-megabyte string... so we
3500   don't do this when the string is sufficiently long.
3501
3502   ALSO: this processing is disabled when partial matching is requested.
3503   */
3504
3505   if (req_byte >= 0 &&
3506       end_subject - start_match < REQ_BYTE_MAX &&
3507       !match_block.partial)
3508     {
3509     register const uschar *p = start_match + ((first_byte >= 0)? 1 : 0);
3510
3511     /* We don't need to repeat the search if we haven't yet reached the
3512     place we found it at last time. */
3513
3514     if (p > req_byte_ptr)
3515       {
3516       if (req_byte_caseless)
3517         {
3518         while (p < end_subject)
3519           {
3520           register int pp = *p++;
3521           if (pp == req_byte || pp == req_byte2) { p--; break; }
3522           }
3523         }
3524       else
3525         {
3526         while (p < end_subject)
3527           {
3528           if (*p++ == req_byte) { p--; break; }
3529           }
3530         }
3531
3532       /* If we can't find the required character, break the matching loop */
3533
3534       if (p >= end_subject) break;
3535
3536       /* If we have found the required character, save the point where we
3537       found it, so that we don't search again next time round the loop if
3538       the start hasn't passed this character yet. */
3539
3540       req_byte_ptr = p;
3541       }
3542     }
3543
3544   /* When a match occurs, substrings will be set for all internal extractions;
3545   we just need to set up the whole thing as substring 0 before returning. If
3546   there were too many extractions, set the return code to zero. In the case
3547   where we had to get some local store to hold offsets for backreferences, copy
3548   those back references that we can. In this case there need not be overflow
3549   if certain parts of the pattern were not used. */
3550
3551   match_block.start_match = start_match;
3552   match_block.match_call_count = 0;
3553
3554   rc = match(start_match, match_block.start_code, 2, &match_block, ims, NULL,
3555     match_isgroup);
3556
3557   /* When the result is no match, if the subject's first character was a
3558   newline and the PCRE_FIRSTLINE option is set, break (which will return
3559   PCRE_ERROR_NOMATCH). The option requests that a match occur before the first
3560   newline in the subject. Otherwise, advance the pointer to the next character
3561   and continue - but the continuation will actually happen only when the
3562   pattern is not anchored. */
3563
3564   if (rc == MATCH_NOMATCH)
3565     {
3566     if (firstline && *start_match == NEWLINE) break;
3567     start_match++;
3568 #ifdef SUPPORT_UTF8
3569     if (match_block.utf8)
3570       while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
3571         start_match++;
3572 #endif
3573     continue;
3574     }
3575
3576   if (rc != MATCH_MATCH)
3577     {
3578     DPRINTF((">>>> error: returning %d\n", rc));
3579     return rc;
3580     }
3581
3582   /* We have a match! Copy the offset information from temporary store if
3583   necessary */
3584
3585   if (using_temporary_offsets)
3586     {
3587     if (offsetcount >= 4)
3588       {
3589       memcpy(offsets + 2, match_block.offset_vector + 2,
3590         (offsetcount - 2) * sizeof(int));
3591       DPRINTF(("Copied offsets from temporary memory\n"));
3592       }
3593     if (match_block.end_offset_top > offsetcount)
3594       match_block.offset_overflow = TRUE;
3595
3596     DPRINTF(("Freeing temporary memory\n"));
3597     (pcre_free)(match_block.offset_vector);
3598     }
3599
3600   rc = match_block.offset_overflow? 0 : match_block.end_offset_top/2;
3601
3602   if (offsetcount < 2) rc = 0; else
3603     {
3604     offsets[0] = start_match - match_block.start_subject;
3605     offsets[1] = match_block.end_match_ptr - match_block.start_subject;
3606     }
3607
3608   DPRINTF((">>>> returning %d\n", rc));
3609   return rc;
3610   }
3611
3612 /* This "while" is the end of the "do" above */
3613
3614 while (!anchored && start_match <= end_subject);
3615
3616 if (using_temporary_offsets)
3617   {
3618   DPRINTF(("Freeing temporary memory\n"));
3619   (pcre_free)(match_block.offset_vector);
3620   }
3621
3622 if (match_block.partial && match_block.hitend)
3623   {
3624   DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
3625   return PCRE_ERROR_PARTIAL;
3626   }
3627 else
3628   {
3629   DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
3630   return PCRE_ERROR_NOMATCH;
3631   }
3632 }
3633
3634 /* End of pcre_exec.c */