src/src/pcre/pcre_compile.c

   1 /* $Cambridge: exim/src/src/pcre/pcre_compile.c,v 1.4 2007/01/23 15:08:45 ph10 Exp $ */
   2
   3 /*************************************************
   4 *      Perl-Compatible Regular Expressions       *
   5 *************************************************/
   6
   7 /* PCRE is a library of functions to support regular expressions whose syntax
   8 and semantics are as close as possible to those of the Perl 5 language.
   9
  10                        Written by Philip Hazel
  11            Copyright (c) 1997-2006 University of Cambridge
  12
  13 -----------------------------------------------------------------------------
  14 Redistribution and use in source and binary forms, with or without
  15 modification, are permitted provided that the following conditions are met:
  16
  17     * Redistributions of source code must retain the above copyright notice,
  18       this list of conditions and the following disclaimer.
  19
  20     * Redistributions in binary form must reproduce the above copyright
  21       notice, this list of conditions and the following disclaimer in the
  22       documentation and/or other materials provided with the distribution.
  23
  24     * Neither the name of the University of Cambridge nor the names of its
  25       contributors may be used to endorse or promote products derived from
  26       this software without specific prior written permission.
  27
  28 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  29 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  30 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  31 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  32 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  33 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  34 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  35 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  36 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  37 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  38 POSSIBILITY OF SUCH DAMAGE.
  39 -----------------------------------------------------------------------------
  40 */
  41
  42
  43 /* This module contains the external function pcre_compile(), along with
  44 supporting internal functions that are not used by other modules. */
  45
  46
  47 #define NLBLOCK cd             /* Block containing newline information */
  48 #define PSSTART start_pattern  /* Field containing processed string start */
  49 #define PSEND   end_pattern    /* Field containing processed string end */
  50
  51
  52 #include "pcre_internal.h"
  53
  54
  55 /* When DEBUG is defined, we need the pcre_printint() function, which is also
  56 used by pcretest. DEBUG is not defined when building a production library. */
  57
  58 #ifdef DEBUG
  59 #include "pcre_printint.src"
  60 #endif
  61
  62
  63 /*************************************************
  64 *      Code parameters and static tables         *
  65 *************************************************/
  66
/* COMPILE_WORK_SIZE specifies the size of the stack workspace that the
compiler uses for two different purposes:

(1) During the first pre-compile phase, which determines how much memory is
required, the regex is partly compiled into this space. The compiled parts are
discarded as soon as they can be, so that hopefully there will never be an
overrun. The code does, however, check for an overrun. The largest amount I've
seen used is 218, so this number is very generous.

(2) During the second, actual compile phase, the same workspace is used for
remembering forward references to groups so that they can be filled in at the
end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
is 4 there is plenty of room. */

#define COMPILE_WORK_SIZE (4096)
  80
  81
/* Table for handling escaped characters in the range '0'-'z'. Positive returns
are simple data values; negative values are for special things like \d and so
on. Zero means further processing is needed (for things like \x), or the escape
is invalid.

The table is consulted by check_escape(). In the ASCII build it is indexed by
(c - '0') after c has been checked to lie between '0' and 'z'; in the EBCDIC
build it is indexed by (c - 0x48). The position of every entry therefore
matters: do not insert or remove entries without keeping the rows aligned with
the character codes shown in the row comments. */

#if !EBCDIC   /* This is the "normal" table for ASCII systems */
static const short int escapes[] = {
     0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */
     0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */
   '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E,      0, -ESC_G,   /* @ - G */
     0,      0,      0,      0,      0,      0,      0,      0,   /* H - O */
-ESC_P, -ESC_Q, -ESC_R, -ESC_S,      0,      0,      0, -ESC_W,   /* P - W */
-ESC_X,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */
   '`',      7, -ESC_b,      0, -ESC_d,  ESC_e,  ESC_f,      0,   /* ` - g */
     0,      0,      0, -ESC_k,      0,      0,  ESC_n,      0,   /* h - o */
-ESC_p,      0,  ESC_r, -ESC_s,  ESC_tee,    0,      0, -ESC_w,   /* p - w */
     0,      0, -ESC_z                                            /* x - z */
};

#else         /* This is the "abnormal" table for EBCDIC systems */
/* The comment at the start of each row is the hexadecimal character code of
the row's first entry; the first row corresponds to index 0 (code 0x48). */
static const short int escapes[] = {
/*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',
/*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,
/*  58 */     0,     0,    '!',     '$',    '*',   ')',    ';',    '~',
/*  60 */   '-',   '/',      0,       0,      0,     0,      0,      0,
/*  68 */     0,     0,    '|',     ',',    '%',   '_',    '>',    '?',
/*  70 */     0,     0,      0,       0,      0,     0,      0,      0,
/*  78 */     0,   '`',    ':',     '#',    '@',  '\'',    '=',    '"',
/*  80 */     0,     7, -ESC_b,       0, -ESC_d, ESC_e,  ESC_f,      0,
/*  88 */     0,     0,      0,     '{',      0,     0,      0,      0,
/*  90 */     0,     0, -ESC_k,     'l',      0, ESC_n,      0, -ESC_p,
/*  98 */     0, ESC_r,      0,     '}',      0,     0,      0,      0,
/*  A0 */     0,   '~', -ESC_s, ESC_tee,      0,     0, -ESC_w,      0,
/*  A8 */     0,-ESC_z,      0,       0,      0,   '[',      0,      0,
/*  B0 */     0,     0,      0,       0,      0,     0,      0,      0,
/*  B8 */     0,     0,      0,       0,      0,   ']',    '=',    '-',
/*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,
/*  C8 */     0,     0,      0,       0,      0,     0,      0,      0,
/*  D0 */   '}',     0,      0,       0,      0,     0,      0, -ESC_P,
/*  D8 */-ESC_Q,-ESC_R,      0,       0,      0,     0,      0,      0,
/*  E0 */  '\\',     0, -ESC_S,       0,      0,     0, -ESC_W, -ESC_X,
/*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,
/*  F0 */     0,     0,      0,       0,      0,     0,      0,      0,
/*  F8 */     0,     0,      0,       0,      0,     0,      0,      0
};
#endif
 128
 129
/* Tables of names of POSIX character classes and their lengths. The two tables
are parallel: posix_name_lengths[n] is the length of posix_names[n]. The list
is terminated by a zero length entry, which exists only in the lengths table
(posix_names itself has no terminating entry). The first three must be alpha,
lower, upper, as this is assumed for handling case independence. */

static const char *const posix_names[] = {
  "alpha", "lower", "upper",
  "alnum", "ascii", "blank", "cntrl", "digit", "graph",
  "print", "punct", "space", "word",  "xdigit" };

/* One length per name above, in the same order, followed by the 0 terminator */

static const uschar posix_name_lengths[] = {
  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
 141
/* Table of class bit maps for each POSIX class. Each class is formed from a
base map, with an optional addition or removal of another map. Then, for some
classes, there is some additional tweaking: for [:blank:] the vertical space
characters are removed, and for [:alpha:] and [:alnum:] the underscore
character is removed. The triples in the table consist of the base map offset,
second map offset or -1 if no second map, and a non-negative value for map
addition or a negative value for map subtraction (if there are two maps). The
absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
remove vertical space characters, 2 => remove underscore.

There is one triple per class, in the same order as the names in posix_names,
so the triple for class number n starts at index 3*n. For example, the first
triple builds [:alpha:] from the "word" map with the "digit" map subtracted
(third field negative) and the underscore removed (absolute value 2). */

static const int posix_class_maps[] = {
  cbit_word,  cbit_digit, -2,             /* alpha */
  cbit_lower, -1,          0,             /* lower */
  cbit_upper, -1,          0,             /* upper */
  cbit_word,  -1,          2,             /* alnum - word without underscore */
  cbit_print, cbit_cntrl,  0,             /* ascii */
  cbit_space, -1,          1,             /* blank - a GNU extension */
  cbit_cntrl, -1,          0,             /* cntrl */
  cbit_digit, -1,          0,             /* digit */
  cbit_graph, -1,          0,             /* graph */
  cbit_print, -1,          0,             /* print */
  cbit_punct, -1,          0,             /* punct */
  cbit_space, -1,          0,             /* space */
  cbit_word,  -1,          0,             /* word - a Perl extension */
  cbit_xdigit,-1,          0              /* xdigit */
};
 168
 169
 170 #define STRING(a)  # a
 171 #define XSTRING(s) STRING(s)
 172
 173 /* The texts of compile-time error messages. These are "char *" because they
 174 are passed to the outside world. Do not ever re-use any error number, because
 175 they are documented. Always add a new error instead. Messages marked DEAD below
 176 are no longer used. */
 177
 178 static const char *error_texts[] = {
 179   "no error",
 180   "\\ at end of pattern",
 181   "\\c at end of pattern",
 182   "unrecognized character follows \\",
 183   "numbers out of order in {} quantifier",
 184   /* 5 */
 185   "number too big in {} quantifier",
 186   "missing terminating ] for character class",
 187   "invalid escape sequence in character class",
 188   "range out of order in character class",
 189   "nothing to repeat",
 190   /* 10 */
 191   "operand of unlimited repeat could match the empty string",  /** DEAD **/
 192   "internal error: unexpected repeat",
 193   "unrecognized character after (?",
 194   "POSIX named classes are supported only within a class",
 195   "missing )",
 196   /* 15 */
 197   "reference to non-existent subpattern",
 198   "erroffset passed as NULL",
 199   "unknown option bit(s) set",
 200   "missing ) after comment",
 201   "parentheses nested too deeply",  /** DEAD **/
 202   /* 20 */
 203   "regular expression too large",
 204   "failed to get memory",
 205   "unmatched parentheses",
 206   "internal error: code overflow",
 207   "unrecognized character after (?<",
 208   /* 25 */
 209   "lookbehind assertion is not fixed length",
 210   "malformed number or name after (?(",
 211   "conditional group contains more than two branches",
 212   "assertion expected after (?(",
 213   "(?R or (?digits must be followed by )",
 214   /* 30 */
 215   "unknown POSIX class name",
 216   "POSIX collating elements are not supported",
 217   "this version of PCRE is not compiled with PCRE_UTF8 support",
 218   "spare error",  /** DEAD **/
 219   "character value in \\x{...} sequence is too large",
 220   /* 35 */
 221   "invalid condition (?(0)",
 222   "\\C not allowed in lookbehind assertion",
 223   "PCRE does not support \\L, \\l, \\N, \\U, or \\u",
 224   "number after (?C is > 255",
 225   "closing ) for (?C expected",
 226   /* 40 */
 227   "recursive call could loop indefinitely",
 228   "unrecognized character after (?P",
 229   "syntax error in subpattern name (missing terminator)",
 230   "two named subpatterns have the same name",
 231   "invalid UTF-8 string",
 232   /* 45 */
 233   "support for \\P, \\p, and \\X has not been compiled",
 234   "malformed \\P or \\p sequence",
 235   "unknown property name after \\P or \\p",
 236   "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)",
 237   "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")",
 238   /* 50 */
 239   "repeated subpattern is too long",
 240   "octal value is greater than \\377 (not in UTF-8 mode)",
 241   "internal error: overran compiling workspace",
 242   "internal error: previously-checked referenced subpattern not found",
 243   "DEFINE group contains more than one branch",
 244   /* 55 */
 245   "repeating a DEFINE group is not allowed",
 246   "inconsistent NEWLINE options",
 247   "\\g is not followed by an (optionally braced) non-zero number"
 248 };
 249
 250
/* Table to identify digits and hex digits. This is used when compiling
patterns. Note that the tables in chartables are dependent on the locale, and
may mark arbitrary characters as digits - but the PCRE compiling code expects
to handle only 0-9 as decimal digits, and only 0-9, a-f, and A-F as hexadecimal
digits, when compiling. That is why we have a private table here. It costs 256
bytes, but it is a lot faster than doing character value tests (at least in
some simple cases I timed), and in some applications one wants PCRE to compile
efficiently as well as match efficiently.

For convenience, we use the same bit definitions as in chartables:

  0x04   decimal digit
  0x08   hexadecimal digit

Then we can use ctype_digit and ctype_xdigit in the code.

The table is indexed directly by character code. Entries for 0-9 are 0x0c (both
bits set), entries for a-f and A-F are 0x08, and all other entries are zero. */

#if !EBCDIC    /* This is the "normal" case, for ASCII systems */
static const unsigned char digitab[] =
  {
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - '  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ( - /  */
  0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  */
  0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /*  8 - ?  */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  @ - G  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  H - O  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  P - W  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  X - _  */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  ` - g  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  h - o  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  p - w  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  x -127 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */

#else          /* This is the "abnormal" case, for EBCDIC systems */
/* In this table the trailing two hex digits in some row comments give the
character code of the first entry of that row. */
static const unsigned char digitab[] =
  {
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 10 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  32- 39 20 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  40- 47    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  48- 55 30 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  56- 63    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 40 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  72- |     */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 50 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  88- ¬     */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 60 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ?     */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- "     */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g  80 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  h -143    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p  90 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  q -159    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x  A0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  y -175    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ^ -183 B0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191    */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  { - G  C0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  H -207    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  } - P  D0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  Q -223    */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  \ - X  E0 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  Y -239    */
  0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  F0 */
  0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255    */

/* EBCDIC-only character type table, indexed by character code. In this file
check_escape() tests (ebcdic_chartab[c] & 0x0E) to decide whether a character
following a backslash is alphameric. NOTE(review): the meanings of the
individual bits are presumably those of the chartables ctype bits, of which
only 0x04 (digit) and 0x08 (hex digit) are documented here - confirm against
the chartables definitions before changing any entry. */

static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
  0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*   0-  7 */
  0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /*   8- 15 */
  0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  16- 23 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
  0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  32- 39 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  40- 47 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  48- 55 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  56- 63 */
  0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 */
  0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /*  72- |  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 */
  0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /*  88- ¬  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 */
  0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ?  */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- "  */
  0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  h -143 */
  0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  q -159 */
  0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  y -175 */
  0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ^ -183 */
  0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
  0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /*  { - G  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  H -207 */
  0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /*  } - P  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  Q -223 */
  0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /*  \ - X  */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  Y -239 */
  0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /*  0 - 7  */
  0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255 */
#endif
 373
 374
/* Forward declaration of compile_regex(). Declaring it here, ahead of its
definition, lets functions that appear earlier in the file call it, which is
what allows the mutual recursion. */

static BOOL
  compile_regex(int, int, uschar **, const uschar **, int *, BOOL, int, int *,
    int *, branch_chain *, compile_data *, int *);
 380
 381
 382
 383 /*************************************************
 384 *            Handle escapes                      *
 385 *************************************************/
 386
 387 /* This function is called when a \ has been encountered. It either returns a
 388 positive value for a simple escape such as \n, or a negative value which
 389 encodes one of the more complicated things such as \d. A backreference to group
 390 n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
 391 UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
 392 ptr is pointing at the \. On exit, it is on the final character of the escape
 393 sequence.
 394
 395 Arguments:
 396   ptrptr         points to the pattern position pointer
 397   errorcodeptr   points to the errorcode variable
 398   bracount       number of previous extracting brackets
 399   options        the options bits
 400   isclass        TRUE if inside a character class
 401
 402 Returns:         zero or positive => a data character
 403                  negative => a special escape sequence
 404                  on error, errorptr is set
 405 */
 406
 407 static int
 408 check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
 409   int options, BOOL isclass)
 410 {
 411 BOOL utf8 = (options & PCRE_UTF8) != 0;
 412 const uschar *ptr = *ptrptr + 1;
 413 int c, i;
 414
 415 GETCHARINCTEST(c, ptr);           /* Get character value, increment pointer */
 416 ptr--;                            /* Set pointer back to the last byte */
 417
 418 /* If backslash is at the end of the pattern, it's an error. */
 419
 420 if (c == 0) *errorcodeptr = ERR1;
 421
 422 /* Non-alphamerics are literals. For digits or letters, do an initial lookup in
 423 a table. A non-zero result is something that can be returned immediately.
 424 Otherwise further processing may be required. */
 425
 426 #if !EBCDIC    /* ASCII coding */
 427 else if (c < '0' || c > 'z') {}                           /* Not alphameric */
 428 else if ((i = escapes[c - '0']) != 0) c = i;
 429
 430 #else          /* EBCDIC coding */
 431 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {}   /* Not alphameric */
 432 else if ((i = escapes[c - 0x48]) != 0)  c = i;
 433 #endif
 434
 435 /* Escapes that need further processing, or are illegal. */
 436
 437 else
 438   {
 439   const uschar *oldptr;
 440   BOOL braced, negated;
 441
 442   switch (c)
 443     {
 444     /* A number of Perl escapes are not handled by PCRE. We give an explicit
 445     error. */
 446
 447     case 'l':
 448     case 'L':
 449     case 'N':
 450     case 'u':
 451     case 'U':
 452     *errorcodeptr = ERR37;
 453     break;
 454
 455     /* \g must be followed by a number, either plain or braced. If positive, it
 456     is an absolute backreference. If negative, it is a relative backreference.
 457     This is a Perl 5.10 feature. */
 458
 459     case 'g':
 460     if (ptr[1] == '{')
 461       {
 462       braced = TRUE;
 463       ptr++;
 464       }
 465     else braced = FALSE;
 466
 467     if (ptr[1] == '-')
 468       {
 469       negated = TRUE;
 470       ptr++;
 471       }
 472     else negated = FALSE;
 473
 474     c = 0;
 475     while ((digitab[ptr[1]] & ctype_digit) != 0)
 476       c = c * 10 + *(++ptr) - '0';
 477
 478     if (c == 0 || (braced && *(++ptr) != '}'))
 479       {
 480       *errorcodeptr = ERR57;
 481       return 0;
 482       }
 483
 484     if (negated)
 485       {
 486       if (c > bracount)
 487         {
 488         *errorcodeptr = ERR15;
 489         return 0;
 490         }
 491       c = bracount - (c - 1);
 492       }
 493
 494     c = -(ESC_REF + c);
 495     break;
 496
 497     /* The handling of escape sequences consisting of a string of digits
 498     starting with one that is not zero is not straightforward. By experiment,
 499     the way Perl works seems to be as follows:
 500
 501     Outside a character class, the digits are read as a decimal number. If the
 502     number is less than 10, or if there are that many previous extracting
 503     left brackets, then it is a back reference. Otherwise, up to three octal
 504     digits are read to form an escaped byte. Thus \123 is likely to be octal
 505     123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
 506     value is greater than 377, the least significant 8 bits are taken. Inside a
 507     character class, \ followed by a digit is always an octal number. */
 508
 509     case '1': case '2': case '3': case '4': case '5':
 510     case '6': case '7': case '8': case '9':
 511
 512     if (!isclass)
 513       {
 514       oldptr = ptr;
 515       c -= '0';
 516       while ((digitab[ptr[1]] & ctype_digit) != 0)
 517         c = c * 10 + *(++ptr) - '0';
 518       if (c < 10 || c <= bracount)
 519         {
 520         c = -(ESC_REF + c);
 521         break;
 522         }
 523       ptr = oldptr;      /* Put the pointer back and fall through */
 524       }
 525
 526     /* Handle an octal number following \. If the first digit is 8 or 9, Perl
 527     generates a binary zero byte and treats the digit as a following literal.
 528     Thus we have to pull back the pointer by one. */
 529
 530     if ((c = *ptr) >= '8')
 531       {
 532       ptr--;
 533       c = 0;
 534       break;
 535       }
 536
 537     /* \0 always starts an octal number, but we may drop through to here with a
 538     larger first octal digit. The original code used just to take the least
 539     significant 8 bits of octal numbers (I think this is what early Perls used
 540     to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
 541     than 3 octal digits. */
 542
 543     case '0':
 544     c -= '0';
 545     while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
 546         c = c * 8 + *(++ptr) - '0';
 547     if (!utf8 && c > 255) *errorcodeptr = ERR51;
 548     break;
 549
 550     /* \x is complicated. \x{ddd} is a character number which can be greater
 551     than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
 552     treated as a data character. */
 553
 554     case 'x':
 555     if (ptr[1] == '{')
 556       {
 557       const uschar *pt = ptr + 2;
 558       int count = 0;
 559
 560       c = 0;
 561       while ((digitab[*pt] & ctype_xdigit) != 0)
 562         {
 563         register int cc = *pt++;
 564         if (c == 0 && cc == '0') continue;     /* Leading zeroes */
 565         count++;
 566
 567 #if !EBCDIC    /* ASCII coding */
 568         if (cc >= 'a') cc -= 32;               /* Convert to upper case */
 569         c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));
 570 #else          /* EBCDIC coding */
 571         if (cc >= 'a' && cc <= 'z') cc += 64;  /* Convert to upper case */
 572         c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));
 573 #endif
 574         }
 575
 576       if (*pt == '}')
 577         {
 578         if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
 579         ptr = pt;
 580         break;
 581         }
 582
 583       /* If the sequence of hex digits does not end with '}', then we don't
 584       recognize this construct; fall through to the normal \x handling. */
 585       }
 586
 587     /* Read just a single-byte hex-defined char */
 588
 589     c = 0;
 590     while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
 591       {
 592       int cc;                               /* Some compilers don't like ++ */
 593       cc = *(++ptr);                        /* in initializers */
 594 #if !EBCDIC    /* ASCII coding */
 595       if (cc >= 'a') cc -= 32;              /* Convert to upper case */
 596       c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
 597 #else          /* EBCDIC coding */
 598       if (cc <= 'z') cc += 64;              /* Convert to upper case */
 599       c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
 600 #endif
 601       }
 602     break;
 603
 604     /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
 605     This coding is ASCII-specific, but then the whole concept of \cx is
 606     ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
 607
 608     case 'c':
 609     c = *(++ptr);
 610     if (c == 0)
 611       {
 612       *errorcodeptr = ERR2;
 613       return 0;
 614       }
 615
 616 #if !EBCDIC    /* ASCII coding */
 617     if (c >= 'a' && c <= 'z') c -= 32;
 618     c ^= 0x40;
 619 #else          /* EBCDIC coding */
 620     if (c >= 'a' && c <= 'z') c += 64;
 621     c ^= 0xC0;
 622 #endif
 623     break;
 624
 625     /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
 626     other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
 627     for Perl compatibility, it is a literal. This code looks a bit odd, but
 628     there used to be some cases other than the default, and there may be again
 629     in future, so I haven't "optimized" it. */
 630
 631     default:
 632     if ((options & PCRE_EXTRA) != 0) switch(c)
 633       {
 634       default:
 635       *errorcodeptr = ERR3;
 636       break;
 637       }
 638     break;
 639     }
 640   }
 641
 642 *ptrptr = ptr;
 643 return c;
 644 }
 645
 646
 647
 648 #ifdef SUPPORT_UCP
 649 /*************************************************
 650 *               Handle \P and \p                 *
 651 *************************************************/
 652
 653 /* This function is called after \P or \p has been encountered, provided that
 654 PCRE is compiled with support for Unicode properties. On entry, ptrptr is
 655 pointing at the P or p. On exit, it is pointing at the final character of the
 656 escape sequence.
 657
 658 Argument:
 659   ptrptr         points to the pattern position pointer
 660   negptr         points to a boolean that is set TRUE for negation else FALSE
 661   dptr           points to an int that is set to the detailed property value
 662   errorcodeptr   points to the error code variable
 663
 664 Returns:         type value from ucp_type_table, or -1 for an invalid type
 665 */
 666
 667 static int
 668 get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
 669 {
 670 int c, i, bot, top;
 671 const uschar *ptr = *ptrptr;
 672 char name[32];
 673
 674 c = *(++ptr);
 675 if (c == 0) goto ERROR_RETURN;
 676
 677 *negptr = FALSE;
 678
 679 /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
 680 negation. */
 681
 682 if (c == '{')
 683   {
 684   if (ptr[1] == '^')
 685     {
 686     *negptr = TRUE;
 687     ptr++;
 688     }
 689   for (i = 0; i < sizeof(name) - 1; i++)
 690     {
 691     c = *(++ptr);
 692     if (c == 0) goto ERROR_RETURN;
 693     if (c == '}') break;
 694     name[i] = c;
 695     }
 696   if (c !='}') goto ERROR_RETURN;
 697   name[i] = 0;
 698   }
 699
 700 /* Otherwise there is just one following character */
 701
 702 else
 703   {
 704   name[0] = c;
 705   name[1] = 0;
 706   }
 707
 708 *ptrptr = ptr;
 709
 710 /* Search for a recognized property name using binary chop */
 711
 712 bot = 0;
 713 top = _pcre_utt_size;
 714
 715 while (bot < top)
 716   {
 717   i = (bot + top) >> 1;
 718   c = strcmp(name, _pcre_utt[i].name);
 719   if (c == 0)
 720     {
 721     *dptr = _pcre_utt[i].value;
 722     return _pcre_utt[i].type;
 723     }
 724   if (c > 0) bot = i + 1; else top = i;
 725   }
 726
 727 *errorcodeptr = ERR47;
 728 *ptrptr = ptr;
 729 return -1;
 730
 731 ERROR_RETURN:
 732 *errorcodeptr = ERR46;
 733 *ptrptr = ptr;
 734 return -1;
 735 }
 736 #endif
 737
 738
 739
 740
 741 /*************************************************
 742 *            Check for counted repeat            *
 743 *************************************************/
 744
 745 /* This function is called when a '{' is encountered in a place where it might
 746 start a quantifier. It looks ahead to see if it really is a quantifier or not.
 747 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
 748 where the ddds are digits.
 749
 750 Arguments:
 751   p         pointer to the first char after '{'
 752
 753 Returns:    TRUE or FALSE
 754 */
 755
 756 static BOOL
 757 is_counted_repeat(const uschar *p)
 758 {
 759 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
 760 while ((digitab[*p] & ctype_digit) != 0) p++;
 761 if (*p == '}') return TRUE;
 762
 763 if (*p++ != ',') return FALSE;
 764 if (*p == '}') return TRUE;
 765
 766 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
 767 while ((digitab[*p] & ctype_digit) != 0) p++;
 768
 769 return (*p == '}');
 770 }
 771
 772
 773
 774 /*************************************************
 775 *         Read repeat counts                     *
 776 *************************************************/
 777
 778 /* Read an item of the form {n,m} and return the values. This is called only
 779 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
 780 so the syntax is guaranteed to be correct, but we need to check the values.
 781
 782 Arguments:
 783   p              pointer to first char after '{'
 784   minp           pointer to int for min
 785   maxp           pointer to int for max
 786                  returned as -1 if no max
 787   errorcodeptr   points to error code variable
 788
 789 Returns:         pointer to '}' on success;
 790                  current ptr on error, with errorcodeptr set non-zero
 791 */
 792
 793 static const uschar *
 794 read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
 795 {
 796 int min = 0;
 797 int max = -1;
 798
 799 /* Read the minimum value and do a paranoid check: a negative value indicates
 800 an integer overflow. */
 801
 802 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
 803 if (min < 0 || min > 65535)
 804   {
 805   *errorcodeptr = ERR5;
 806   return p;
 807   }
 808
 809 /* Read the maximum value if there is one, and again do a paranoid on its size.
 810 Also, max must not be less than min. */
 811
 812 if (*p == '}') max = min; else
 813   {
 814   if (*(++p) != '}')
 815     {
 816     max = 0;
 817     while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
 818     if (max < 0 || max > 65535)
 819       {
 820       *errorcodeptr = ERR5;
 821       return p;
 822       }
 823     if (max < min)
 824       {
 825       *errorcodeptr = ERR4;
 826       return p;
 827       }
 828     }
 829   }
 830
 831 /* Fill in the required variables, and pass back the pointer to the terminating
 832 '}'. */
 833
 834 *minp = min;
 835 *maxp = max;
 836 return p;
 837 }
 838
 839
 840
 841 /*************************************************
 842 *       Find forward referenced subpattern       *
 843 *************************************************/
 844
 845 /* This function scans along a pattern's text looking for capturing
 846 subpatterns, and counting them. If it finds a named pattern that matches the
 847 name it is given, it returns its number. Alternatively, if the name is NULL, it
 848 returns when it reaches a given numbered subpattern. This is used for forward
 849 references to subpatterns. We know that if (?P< is encountered, the name will
 850 be terminated by '>' because that is checked in the first pass.
 851
 852 Arguments:
 853   ptr          current position in the pattern
 854   count        current count of capturing parens so far encountered
 855   name         name to seek, or NULL if seeking a numbered subpattern
 856   lorn         name length, or subpattern number if name is NULL
 857   xmode        TRUE if we are in /x mode
 858
 859 Returns:       the number of the named subpattern, or -1 if not found
 860 */
 861
 862 static int
 863 find_parens(const uschar *ptr, int count, const uschar *name, int lorn,
 864   BOOL xmode)
 865 {
 866 const uschar *thisname;
 867
 868 for (; *ptr != 0; ptr++)
 869   {
 870   int term;
 871
 872   /* Skip over backslashed characters and also entire \Q...\E */
 873
 874   if (*ptr == '\\')
 875     {
 876     if (*(++ptr) == 0) return -1;
 877     if (*ptr == 'Q') for (;;)
 878       {
 879       while (*(++ptr) != 0 && *ptr != '\\');
 880       if (*ptr == 0) return -1;
 881       if (*(++ptr) == 'E') break;
 882       }
 883     continue;
 884     }
 885
 886   /* Skip over character classes */
 887
 888   if (*ptr == '[')
 889     {
 890     while (*(++ptr) != ']')
 891       {
 892       if (*ptr == '\\')
 893         {
 894         if (*(++ptr) == 0) return -1;
 895         if (*ptr == 'Q') for (;;)
 896           {
 897           while (*(++ptr) != 0 && *ptr != '\\');
 898           if (*ptr == 0) return -1;
 899           if (*(++ptr) == 'E') break;
 900           }
 901         continue;
 902         }
 903       }
 904     continue;
 905     }
 906
 907   /* Skip comments in /x mode */
 908
 909   if (xmode && *ptr == '#')
 910     {
 911     while (*(++ptr) != 0 && *ptr != '\n');
 912     if (*ptr == 0) return -1;
 913     continue;
 914     }
 915
 916   /* An opening parens must now be a real metacharacter */
 917
 918   if (*ptr != '(') continue;
 919   if (ptr[1] != '?')
 920     {
 921     count++;
 922     if (name == NULL && count == lorn) return count;
 923     continue;
 924     }
 925
 926   ptr += 2;
 927   if (*ptr == 'P') ptr++;                      /* Allow optional P */
 928
 929   /* We have to disambiguate (?<! and (?<= from (?<name> */
 930
 931   if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') &&
 932        *ptr != '\'')
 933     continue;
 934
 935   count++;
 936
 937   if (name == NULL && count == lorn) return count;
 938   term = *ptr++;
 939   if (term == '<') term = '>';
 940   thisname = ptr;
 941   while (*ptr != term) ptr++;
 942   if (name != NULL && lorn == ptr - thisname &&
 943       strncmp((const char *)name, (const char *)thisname, lorn) == 0)
 944     return count;
 945   }
 946
 947 return -1;
 948 }
 949
 950
 951
 952 /*************************************************
 953 *      Find first significant op code            *
 954 *************************************************/
 955
 956 /* This is called by several functions that scan a compiled expression looking
 957 for a fixed first character, or an anchoring op code etc. It skips over things
 958 that do not influence this. For some calls, a change of option is important.
 959 For some calls, it makes sense to skip negative forward and all backward
 960 assertions, and also the \b assertion; for others it does not.
 961
 962 Arguments:
 963   code         pointer to the start of the group
 964   options      pointer to external options
 965   optbit       the option bit whose changing is significant, or
 966                  zero if none are
 967   skipassert   TRUE if certain assertions are to be skipped
 968
 969 Returns:       pointer to the first significant opcode
 970 */
 971
 972 static const uschar*
 973 first_significant_code(const uschar *code, int *options, int optbit,
 974   BOOL skipassert)
 975 {
 976 for (;;)
 977   {
 978   switch ((int)*code)
 979     {
 980     case OP_OPT:
 981     if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
 982       *options = (int)code[1];
 983     code += 2;
 984     break;
 985
 986     case OP_ASSERT_NOT:
 987     case OP_ASSERTBACK:
 988     case OP_ASSERTBACK_NOT:
 989     if (!skipassert) return code;
 990     do code += GET(code, 1); while (*code == OP_ALT);
 991     code += _pcre_OP_lengths[*code];
 992     break;
 993
 994     case OP_WORD_BOUNDARY:
 995     case OP_NOT_WORD_BOUNDARY:
 996     if (!skipassert) return code;
 997     /* Fall through */
 998
 999     case OP_CALLOUT:
1000     case OP_CREF:
1001     case OP_RREF:
1002     case OP_DEF:
1003     code += _pcre_OP_lengths[*code];
1004     break;
1005
1006     default:
1007     return code;
1008     }
1009   }
1010 /* Control never reaches here */
1011 }
1012
1013
1014
1015
1016 /*************************************************
1017 *        Find the fixed length of a pattern      *
1018 *************************************************/
1019
1020 /* Scan a pattern and compute the fixed length of subject that will match it,
1021 if the length is fixed. This is needed for dealing with backward assertions.
1022 In UTF8 mode, the result is in characters rather than bytes.
1023
1024 Arguments:
1025   code     points to the start of the pattern (the bracket)
1026   options  the compiling options
1027
1028 Returns:   the fixed length, or -1 if there is no fixed length,
1029              or -2 if \C was encountered
1030 */
1031
1032 static int
1033 find_fixedlength(uschar *code, int options)
1034 {
1035 int length = -1;
1036
1037 register int branchlength = 0;
1038 register uschar *cc = code + 1 + LINK_SIZE;
1039
1040 /* Scan along the opcodes for this branch. If we get to the end of the
1041 branch, check the length against that of the other branches. */
1042
1043 for (;;)
1044   {
1045   int d;
1046   register int op = *cc;
1047
1048   switch (op)
1049     {
1050     case OP_CBRA:
1051     case OP_BRA:
1052     case OP_ONCE:
1053     case OP_COND:
1054     d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
1055     if (d < 0) return d;
1056     branchlength += d;
1057     do cc += GET(cc, 1); while (*cc == OP_ALT);
1058     cc += 1 + LINK_SIZE;
1059     break;
1060
1061     /* Reached end of a branch; if it's a ket it is the end of a nested
1062     call. If it's ALT it is an alternation in a nested call. If it is
1063     END it's the end of the outer call. All can be handled by the same code. */
1064
1065     case OP_ALT:
1066     case OP_KET:
1067     case OP_KETRMAX:
1068     case OP_KETRMIN:
1069     case OP_END:
1070     if (length < 0) length = branchlength;
1071       else if (length != branchlength) return -1;
1072     if (*cc != OP_ALT) return length;
1073     cc += 1 + LINK_SIZE;
1074     branchlength = 0;
1075     break;
1076
1077     /* Skip over assertive subpatterns */
1078
1079     case OP_ASSERT:
1080     case OP_ASSERT_NOT:
1081     case OP_ASSERTBACK:
1082     case OP_ASSERTBACK_NOT:
1083     do cc += GET(cc, 1); while (*cc == OP_ALT);
1084     /* Fall through */
1085
1086     /* Skip over things that don't match chars */
1087
1088     case OP_REVERSE:
1089     case OP_CREF:
1090     case OP_RREF:
1091     case OP_DEF:
1092     case OP_OPT:
1093     case OP_CALLOUT:
1094     case OP_SOD:
1095     case OP_SOM:
1096     case OP_EOD:
1097     case OP_EODN:
1098     case OP_CIRC:
1099     case OP_DOLL:
1100     case OP_NOT_WORD_BOUNDARY:
1101     case OP_WORD_BOUNDARY:
1102     cc += _pcre_OP_lengths[*cc];
1103     break;
1104
1105     /* Handle literal characters */
1106
1107     case OP_CHAR:
1108     case OP_CHARNC:
1109     case OP_NOT:
1110     branchlength++;
1111     cc += 2;
1112 #ifdef SUPPORT_UTF8
1113     if ((options & PCRE_UTF8) != 0)
1114       {
1115       while ((*cc & 0xc0) == 0x80) cc++;
1116       }
1117 #endif
1118     break;
1119
1120     /* Handle exact repetitions. The count is already in characters, but we
1121     need to skip over a multibyte character in UTF8 mode.  */
1122
1123     case OP_EXACT:
1124     branchlength += GET2(cc,1);
1125     cc += 4;
1126 #ifdef SUPPORT_UTF8
1127     if ((options & PCRE_UTF8) != 0)
1128       {
1129       while((*cc & 0x80) == 0x80) cc++;
1130       }
1131 #endif
1132     break;
1133
1134     case OP_TYPEEXACT:
1135     branchlength += GET2(cc,1);
1136     cc += 4;
1137     break;
1138
1139     /* Handle single-char matchers */
1140
1141     case OP_PROP:
1142     case OP_NOTPROP:
1143     cc += 2;
1144     /* Fall through */
1145
1146     case OP_NOT_DIGIT:
1147     case OP_DIGIT:
1148     case OP_NOT_WHITESPACE:
1149     case OP_WHITESPACE:
1150     case OP_NOT_WORDCHAR:
1151     case OP_WORDCHAR:
1152     case OP_ANY:
1153     branchlength++;
1154     cc++;
1155     break;
1156
1157     /* The single-byte matcher isn't allowed */
1158
1159     case OP_ANYBYTE:
1160     return -2;
1161
1162     /* Check a class for variable quantification */
1163
1164 #ifdef SUPPORT_UTF8
1165     case OP_XCLASS:
1166     cc += GET(cc, 1) - 33;
1167     /* Fall through */
1168 #endif
1169
1170     case OP_CLASS:
1171     case OP_NCLASS:
1172     cc += 33;
1173
1174     switch (*cc)
1175       {
1176       case OP_CRSTAR:
1177       case OP_CRMINSTAR:
1178       case OP_CRQUERY:
1179       case OP_CRMINQUERY:
1180       return -1;
1181
1182       case OP_CRRANGE:
1183       case OP_CRMINRANGE:
1184       if (GET2(cc,1) != GET2(cc,3)) return -1;
1185       branchlength += GET2(cc,1);
1186       cc += 5;
1187       break;
1188
1189       default:
1190       branchlength++;
1191       }
1192     break;
1193
1194     /* Anything else is variable length */
1195
1196     default:
1197     return -1;
1198     }
1199   }
1200 /* Control never gets here */
1201 }
1202
1203
1204
1205
1206 /*************************************************
1207 *    Scan compiled regex for numbered bracket    *
1208 *************************************************/
1209
1210 /* This little function scans through a compiled pattern until it finds a
1211 capturing bracket with the given number.
1212
1213 Arguments:
1214   code        points to start of expression
1215   utf8        TRUE in UTF-8 mode
1216   number      the required bracket number
1217
1218 Returns:      pointer to the opcode for the bracket, or NULL if not found
1219 */
1220
1221 static const uschar *
1222 find_bracket(const uschar *code, BOOL utf8, int number)
1223 {
1224 for (;;)
1225   {
1226   register int c = *code;
1227   if (c == OP_END) return NULL;
1228
1229   /* XCLASS is used for classes that cannot be represented just by a bit
1230   map. This includes negated single high-valued characters. The length in
1231   the table is zero; the actual length is stored in the compiled code. */
1232
1233   if (c == OP_XCLASS) code += GET(code, 1);
1234
1235   /* Handle capturing bracket */
1236
1237   else if (c == OP_CBRA)
1238     {
1239     int n = GET2(code, 1+LINK_SIZE);
1240     if (n == number) return (uschar *)code;
1241     code += _pcre_OP_lengths[c];
1242     }
1243
1244   /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1245   a multi-byte character. The length in the table is a minimum, so we have to
1246   arrange to skip the extra bytes. */
1247
1248   else
1249     {
1250     code += _pcre_OP_lengths[c];
1251     if (utf8) switch(c)
1252       {
1253       case OP_CHAR:
1254       case OP_CHARNC:
1255       case OP_EXACT:
1256       case OP_UPTO:
1257       case OP_MINUPTO:
1258       case OP_POSUPTO:
1259       case OP_STAR:
1260       case OP_MINSTAR:
1261       case OP_POSSTAR:
1262       case OP_PLUS:
1263       case OP_MINPLUS:
1264       case OP_POSPLUS:
1265       case OP_QUERY:
1266       case OP_MINQUERY:
1267       case OP_POSQUERY:
1268       if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1269       break;
1270       }
1271     }
1272   }
1273 }
1274
1275
1276
1277 /*************************************************
1278 *   Scan compiled regex for recursion reference  *
1279 *************************************************/
1280
1281 /* This little function scans through a compiled pattern until it finds an
1282 instance of OP_RECURSE.
1283
1284 Arguments:
1285   code        points to start of expression
1286   utf8        TRUE in UTF-8 mode
1287
1288 Returns:      pointer to the opcode for OP_RECURSE, or NULL if not found
1289 */
1290
1291 static const uschar *
1292 find_recurse(const uschar *code, BOOL utf8)
1293 {
1294 for (;;)
1295   {
1296   register int c = *code;
1297   if (c == OP_END) return NULL;
1298   if (c == OP_RECURSE) return code;
1299
1300   /* XCLASS is used for classes that cannot be represented just by a bit
1301   map. This includes negated single high-valued characters. The length in
1302   the table is zero; the actual length is stored in the compiled code. */
1303
1304   if (c == OP_XCLASS) code += GET(code, 1);
1305
1306   /* Otherwise, we get the item's length from the table. In UTF-8 mode, opcodes
1307   that are followed by a character may be followed by a multi-byte character.
1308   The length in the table is a minimum, so we have to arrange to skip the extra
1309   bytes. */
1310
1311   else
1312     {
1313     code += _pcre_OP_lengths[c];
1314     if (utf8) switch(c)
1315       {
1316       case OP_CHAR:
1317       case OP_CHARNC:
1318       case OP_EXACT:
1319       case OP_UPTO:
1320       case OP_MINUPTO:
1321       case OP_POSUPTO:
1322       case OP_STAR:
1323       case OP_MINSTAR:
1324       case OP_POSSTAR:
1325       case OP_PLUS:
1326       case OP_MINPLUS:
1327       case OP_POSPLUS:
1328       case OP_QUERY:
1329       case OP_MINQUERY:
1330       case OP_POSQUERY:
1331       if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1332       break;
1333       }
1334     }
1335   }
1336 }
1337
1338
1339
1340 /*************************************************
1341 *    Scan compiled branch for non-emptiness      *
1342 *************************************************/
1343
1344 /* This function scans through a branch of a compiled pattern to see whether it
1345 can match the empty string or not. It is called from could_be_empty()
1346 below and from compile_branch() when checking for an unlimited repeat of a
1347 group that can match nothing. Note that first_significant_code() skips over
1348 assertions. If we hit an unclosed bracket, we return "empty" - this means we've
1349 struck an inner bracket whose current branch will already have been scanned.
1350
1351 Arguments:
1352   code        points to start of search
1353   endcode     points to where to stop
1354   utf8        TRUE if in UTF8 mode
1355
1356 Returns:      TRUE if what is matched could be empty
1357 */
1358
1359 static BOOL
1360 could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1361 {
1362 register int c;
1363 for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1364      code < endcode;
1365      code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1366   {
1367   const uschar *ccode;
1368
1369   c = *code;
1370
1371   if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE)
1372     {
1373     BOOL empty_branch;
1374     if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */
1375
1376     /* Scan a closed bracket */
1377
1378     empty_branch = FALSE;
1379     do
1380       {
1381       if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1382         empty_branch = TRUE;
1383       code += GET(code, 1);
1384       }
1385     while (*code == OP_ALT);
1386     if (!empty_branch) return FALSE;   /* All branches are non-empty */
1387
1388     /* Move past the KET and fudge things so that the increment in the "for"
1389     above has no effect. */
1390
1391     c = OP_END;
1392     code += 1 + LINK_SIZE - _pcre_OP_lengths[c];
1393     continue;
1394     }
1395
1396   /* Handle the other opcodes */
1397
1398   switch (c)
1399     {
1400     /* Check for quantifiers after a class */
1401
1402 #ifdef SUPPORT_UTF8
1403     case OP_XCLASS:
1404     ccode = code + GET(code, 1);
1405     goto CHECK_CLASS_REPEAT;
1406 #endif
1407
1408     case OP_CLASS:
1409     case OP_NCLASS:
1410     ccode = code + 33;
1411
1412 #ifdef SUPPORT_UTF8
1413     CHECK_CLASS_REPEAT:
1414 #endif
1415
1416     switch (*ccode)
1417       {
1418       case OP_CRSTAR:            /* These could be empty; continue */
1419       case OP_CRMINSTAR:
1420       case OP_CRQUERY:
1421       case OP_CRMINQUERY:
1422       break;
1423
1424       default:                   /* Non-repeat => class must match */
1425       case OP_CRPLUS:            /* These repeats aren't empty */
1426       case OP_CRMINPLUS:
1427       return FALSE;
1428
1429       case OP_CRRANGE:
1430       case OP_CRMINRANGE:
1431       if (GET2(ccode, 1) > 0) return FALSE;  /* Minimum > 0 */
1432       break;
1433       }
1434     break;
1435
1436     /* Opcodes that must match a character */
1437
1438     case OP_PROP:
1439     case OP_NOTPROP:
1440     case OP_EXTUNI:
1441     case OP_NOT_DIGIT:
1442     case OP_DIGIT:
1443     case OP_NOT_WHITESPACE:
1444     case OP_WHITESPACE:
1445     case OP_NOT_WORDCHAR:
1446     case OP_WORDCHAR:
1447     case OP_ANY:
1448     case OP_ANYBYTE:
1449     case OP_CHAR:
1450     case OP_CHARNC:
1451     case OP_NOT:
1452     case OP_PLUS:
1453     case OP_MINPLUS:
1454     case OP_POSPLUS:
1455     case OP_EXACT:
1456     case OP_NOTPLUS:
1457     case OP_NOTMINPLUS:
1458     case OP_NOTPOSPLUS:
1459     case OP_NOTEXACT:
1460     case OP_TYPEPLUS:
1461     case OP_TYPEMINPLUS:
1462     case OP_TYPEPOSPLUS:
1463     case OP_TYPEEXACT:
1464     return FALSE;
1465
1466     /* End of branch */
1467
1468     case OP_KET:
1469     case OP_KETRMAX:
1470     case OP_KETRMIN:
1471     case OP_ALT:
1472     return TRUE;
1473
1474     /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1475     MINUPTO, and POSUPTO may be followed by a multibyte character */
1476
1477 #ifdef SUPPORT_UTF8
1478     case OP_STAR:
1479     case OP_MINSTAR:
1480     case OP_POSSTAR:
1481     case OP_QUERY:
1482     case OP_MINQUERY:
1483     case OP_POSQUERY:
1484     case OP_UPTO:
1485     case OP_MINUPTO:
1486     case OP_POSUPTO:
1487     if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1488     break;
1489 #endif
1490     }
1491   }
1492
1493 return TRUE;
1494 }
1495
1496
1497
1498 /*************************************************
1499 *    Scan compiled regex for non-emptiness       *
1500 *************************************************/
1501
1502 /* This function is called to check for left recursive calls. We want to check
1503 the current branch of the current pattern to see if it could match the empty
1504 string. If it could, we must look outwards for branches at other levels,
1505 stopping when we pass beyond the bracket which is the subject of the recursion.
1506
1507 Arguments:
1508   code        points to start of the recursion
1509   endcode     points to where to stop (current RECURSE item)
1510   bcptr       points to the chain of current (unclosed) branch starts
1511   utf8        TRUE if in UTF-8 mode
1512
1513 Returns:      TRUE if what is matched could be empty
1514 */
1515
1516 static BOOL
1517 could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1518   BOOL utf8)
1519 {
1520 while (bcptr != NULL && bcptr->current >= code)
1521   {
1522   if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1523   bcptr = bcptr->outer;
1524   }
1525 return TRUE;
1526 }
1527
1528
1529
1530 /*************************************************
1531 *           Check for POSIX class syntax         *
1532 *************************************************/
1533
1534 /* This function is called when the sequence "[:" or "[." or "[=" is
1535 encountered in a character class. It checks whether this is followed by an
1536 optional ^ and then a sequence of letters, terminated by a matching ":]" or
1537 ".]" or "=]".
1538
1539 Arguments:
1540   ptr      pointer to the initial [
1541   endptr   where to return the end pointer
1542   cd       pointer to compile data
1543
1544 Returns:   TRUE or FALSE
1545 */
1546
1547 static BOOL
1548 check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
1549 {
1550 int terminator;          /* Don't combine these lines; the Solaris cc */
1551 terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */
1552 if (*(++ptr) == '^') ptr++;
1553 while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
1554 if (*ptr == terminator && ptr[1] == ']')
1555   {
1556   *endptr = ptr;
1557   return TRUE;
1558   }
1559 return FALSE;
1560 }
1561
1562
1563
1564
1565 /*************************************************
1566 *          Check POSIX class name                *
1567 *************************************************/
1568
1569 /* This function is called to check the name given in a POSIX-style class entry
1570 such as [:alnum:].
1571
1572 Arguments:
1573   ptr        points to the first letter
1574   len        the length of the name
1575
1576 Returns:     a value representing the name, or -1 if unknown
1577 */
1578
1579 static int
1580 check_posix_name(const uschar *ptr, int len)
1581 {
1582 register int yield = 0;
1583 while (posix_name_lengths[yield] != 0)
1584   {
1585   if (len == posix_name_lengths[yield] &&
1586     strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
1587   yield++;
1588   }
1589 return -1;
1590 }
1591
1592
1593 /*************************************************
1594 *    Adjust OP_RECURSE items in repeated group   *
1595 *************************************************/
1596
1597 /* OP_RECURSE items contain an offset from the start of the regex to the group
1598 that is referenced. This means that groups can be replicated for fixed
1599 repetition simply by copying (because the recursion is allowed to refer to
1600 earlier groups that are outside the current group). However, when a group is
1601 optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
1602 it, after it has been compiled. This means that any OP_RECURSE items within it
1603 that refer to the group itself or any contained groups have to have their
1604 offsets adjusted. That is one of the jobs of this function. Before it is called,
1605 the partially compiled regex must be temporarily terminated with OP_END.
1606
1607 This function has been extended with the possibility of forward references for
1608 recursions and subroutine calls. It must also check the list of such references
1609 for the group we are dealing with. If it finds that one of the recursions in
1610 the current group is on this list, it adjusts the offset in the list, not the
1611 value in the reference (which is a group number).
1612
1613 Arguments:
1614   group      points to the start of the group
1615   adjust     the amount by which the group is to be moved
1616   utf8       TRUE in UTF-8 mode
1617   cd         contains pointers to tables etc.
1618   save_hwm   the hwm forward reference pointer at the start of the group
1619
1620 Returns:     nothing
1621 */
1622
1623 static void
1624 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
1625   uschar *save_hwm)
1626 {
1627 uschar *ptr = group;
1628 while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1629   {
1630   int offset;
1631   uschar *hc;
1632
1633   /* See if this recursion is on the forward reference list. If so, adjust the
1634   reference. */
1635
1636   for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
1637     {
1638     offset = GET(hc, 0);
1639     if (cd->start_code + offset == ptr + 1)
1640       {
1641       PUT(hc, 0, offset + adjust);
1642       break;
1643       }
1644     }
1645
1646   /* Otherwise, adjust the recursion offset if it's after the start of this
1647   group. */
1648
1649   if (hc >= cd->hwm)
1650     {
1651     offset = GET(ptr, 1);
1652     if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1653     }
1654
1655   ptr += 1 + LINK_SIZE;
1656   }
1657 }
1658
1659
1660
1661 /*************************************************
1662 *        Insert an automatic callout point       *
1663 *************************************************/
1664
1665 /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
1666 callout points before each pattern item.
1667
1668 Arguments:
1669   code           current code pointer
1670   ptr            current pattern pointer
1671   cd             pointers to tables etc
1672
1673 Returns:         new code pointer
1674 */
1675
1676 static uschar *
1677 auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
1678 {
1679 *code++ = OP_CALLOUT;
1680 *code++ = 255;
1681 PUT(code, 0, ptr - cd->start_pattern);  /* Pattern offset */
1682 PUT(code, LINK_SIZE, 0);                /* Default length */
1683 return code + 2*LINK_SIZE;
1684 }
1685
1686
1687
1688 /*************************************************
1689 *         Complete a callout item                *
1690 *************************************************/
1691
1692 /* A callout item contains the length of the next item in the pattern, which
1693 we can't fill in till after we have reached the relevant point. This is used
1694 for both automatic and manual callouts.
1695
1696 Arguments:
1697   previous_callout   points to previous callout item
1698   ptr                current pattern pointer
1699   cd                 pointers to tables etc
1700
1701 Returns:             nothing
1702 */
1703
1704 static void
1705 complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
1706 {
1707 int length = ptr - cd->start_pattern - GET(previous_callout, 2);
1708 PUT(previous_callout, 2 + LINK_SIZE, length);
1709 }
1710
1711
1712
1713 #ifdef SUPPORT_UCP
1714 /*************************************************
1715 *           Get othercase range                  *
1716 *************************************************/
1717
1718 /* This function is passed the start and end of a class range, in UTF-8 mode
1719 with UCP support. It searches up the characters, looking for internal ranges of
1720 characters in the "other" case. Each call returns the next one, updating the
1721 start address.
1722
1723 Arguments:
1724   cptr        points to starting character value; updated
1725   d           end value
1726   ocptr       where to put start of othercase range
1727   odptr       where to put end of othercase range
1728
1729 Yield:        TRUE when range returned; FALSE when no more
1730 */
1731
1732 static BOOL
1733 get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
1734   unsigned int *odptr)
1735 {
1736 unsigned int c, othercase, next;
1737
1738 for (c = *cptr; c <= d; c++)
1739   { if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR) break; }
1740
1741 if (c > d) return FALSE;
1742
1743 *ocptr = othercase;
1744 next = othercase + 1;
1745
1746 for (++c; c <= d; c++)
1747   {
1748   if (_pcre_ucp_othercase(c) != next) break;
1749   next++;
1750   }
1751
1752 *odptr = next - 1;
1753 *cptr = c;
1754
1755 return TRUE;
1756 }
1757 #endif  /* SUPPORT_UCP */
1758
1759
1760
1761 /*************************************************
1762 *     Check if auto-possessifying is possible    *
1763 *************************************************/
1764
1765 /* This function is called for unlimited repeats of certain items, to see
1766 whether the next thing could possibly match the repeated item. If not, it makes
1767 sense to automatically possessify the repeated item.
1768
1769 Arguments:
1770   op_code       the repeated op code
1771   item          data for this item, depends on the opcode
1772   utf8          TRUE in UTF-8 mode
1773   utf8_char     used for utf8 character bytes, NULL if not relevant
1774   ptr           next character in pattern
1775   options       options bits
1776   cd            contains pointers to tables etc.
1777
1778 Returns:        TRUE if possessifying is wanted
1779 */
1780
1781 static BOOL
1782 check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
1783   const uschar *ptr, int options, compile_data *cd)
1784 {
1785 int next;
1786
1787 /* Skip whitespace and comments in extended mode */
1788
1789 if ((options & PCRE_EXTENDED) != 0)
1790   {
1791   for (;;)
1792     {
1793     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1794     if (*ptr == '#')
1795       {
1796       while (*(++ptr) != 0)
1797         if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1798       }
1799     else break;
1800     }
1801   }
1802
1803 /* If the next item is one that we can handle, get its value. A non-negative
1804 value is a character, a negative value is an escape value. */
1805
1806 if (*ptr == '\\')
1807   {
1808   int temperrorcode = 0;
1809   next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
1810   if (temperrorcode != 0) return FALSE;
1811   ptr++;    /* Point after the escape sequence */
1812   }
1813
1814 else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
1815   {
1816 #ifdef SUPPORT_UTF8
1817   if (utf8) { GETCHARINC(next, ptr); } else
1818 #endif
1819   next = *ptr++;
1820   }
1821
1822 else return FALSE;
1823
1824 /* Skip whitespace and comments in extended mode */
1825
1826 if ((options & PCRE_EXTENDED) != 0)
1827   {
1828   for (;;)
1829     {
1830     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1831     if (*ptr == '#')
1832       {
1833       while (*(++ptr) != 0)
1834         if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1835       }
1836     else break;
1837     }
1838   }
1839
1840 /* If the next thing is itself optional, we have to give up. */
1841
1842 if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0)
1843   return FALSE;
1844
1845 /* Now compare the next item with the previous opcode. If the previous is a
1846 positive single character match, "item" either contains the character or, if
1847 "item" is greater than 127 in utf8 mode, the character's bytes are in
1848 utf8_char. */
1849
1850
1851 /* Handle cases when the next item is a character. */
1852
1853 if (next >= 0) switch(op_code)
1854   {
1855   case OP_CHAR:
1856 #ifdef SUPPORT_UTF8
1857   if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1858 #endif
1859   return item != next;
1860
1861   /* For CHARNC (caseless character) we must check the other case. If we have
1862   Unicode property support, we can use it to test the other case of
1863   high-valued characters. */
1864
1865   case OP_CHARNC:
1866 #ifdef SUPPORT_UTF8
1867   if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1868 #endif
1869   if (item == next) return FALSE;
1870 #ifdef SUPPORT_UTF8
1871   if (utf8)
1872     {
1873     unsigned int othercase;
1874     if (next < 128) othercase = cd->fcc[next]; else
1875 #ifdef SUPPORT_UCP
1876     othercase = _pcre_ucp_othercase((unsigned int)next);
1877 #else
1878     othercase = NOTACHAR;
1879 #endif
1880     return (unsigned int)item != othercase;
1881     }
1882   else
1883 #endif  /* SUPPORT_UTF8 */
1884   return (item != cd->fcc[next]);  /* Non-UTF-8 mode */
1885
1886   /* For OP_NOT, "item" must be a single-byte character. */
1887
1888   case OP_NOT:
1889   if (next < 0) return FALSE;  /* Not a character */
1890   if (item == next) return TRUE;
1891   if ((options & PCRE_CASELESS) == 0) return FALSE;
1892 #ifdef SUPPORT_UTF8
1893   if (utf8)
1894     {
1895     unsigned int othercase;
1896     if (next < 128) othercase = cd->fcc[next]; else
1897 #ifdef SUPPORT_UCP
1898     othercase = _pcre_ucp_othercase(next);
1899 #else
1900     othercase = NOTACHAR;
1901 #endif
1902     return (unsigned int)item == othercase;
1903     }
1904   else
1905 #endif  /* SUPPORT_UTF8 */
1906   return (item == cd->fcc[next]);  /* Non-UTF-8 mode */
1907
1908   case OP_DIGIT:
1909   return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
1910
1911   case OP_NOT_DIGIT:
1912   return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
1913
1914   case OP_WHITESPACE:
1915   return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
1916
1917   case OP_NOT_WHITESPACE:
1918   return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
1919
1920   case OP_WORDCHAR:
1921   return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
1922
1923   case OP_NOT_WORDCHAR:
1924   return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
1925
1926   default:
1927   return FALSE;
1928   }
1929
1930
1931 /* Handle the case when the next item is \d, \s, etc. */
1932
1933 switch(op_code)
1934   {
1935   case OP_CHAR:
1936   case OP_CHARNC:
1937 #ifdef SUPPORT_UTF8
1938   if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1939 #endif
1940   switch(-next)
1941     {
1942     case ESC_d:
1943     return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
1944
1945     case ESC_D:
1946     return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
1947
1948     case ESC_s:
1949     return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
1950
1951     case ESC_S:
1952     return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
1953
1954     case ESC_w:
1955     return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
1956
1957     case ESC_W:
1958     return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
1959
1960     default:
1961     return FALSE;
1962     }
1963
1964   case OP_DIGIT:
1965   return next == -ESC_D || next == -ESC_s || next == -ESC_W;
1966
1967   case OP_NOT_DIGIT:
1968   return next == -ESC_d;
1969
1970   case OP_WHITESPACE:
1971   return next == -ESC_S || next == -ESC_d || next == -ESC_w;
1972
1973   case OP_NOT_WHITESPACE:
1974   return next == -ESC_s;
1975
1976   case OP_WORDCHAR:
1977   return next == -ESC_W || next == -ESC_s;
1978
1979   case OP_NOT_WORDCHAR:
1980   return next == -ESC_w || next == -ESC_d;
1981
1982   default:
1983   return FALSE;
1984   }
1985
1986 /* Control does not reach here */
1987 }
1988
1989
1990
1991 /*************************************************
1992 *           Compile one branch                   *
1993 *************************************************/
1994
1995 /* Scan the pattern, compiling it into a vector. If the options are
1996 changed during the branch, the pointer is used to change the external options
1997 bits. This function is used during the pre-compile phase when we are trying
1998 to find out the amount of memory needed, as well as during the real compile
1999 phase. The value of lengthptr distinguishes the two phases.
2000
2001 Arguments:
2002   optionsptr     pointer to the option bits
2003   codeptr        points to the pointer to the current code point
2004   ptrptr         points to the current pattern pointer
2005   errorcodeptr   points to error code variable
2006   firstbyteptr   set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2007   reqbyteptr     set to the last literal character required, else < 0
2008   bcptr          points to current branch chain
2009   cd             contains pointers to tables etc.
2010   lengthptr      NULL during the real compile phase
2011                  points to length accumulator during pre-compile phase
2012
2013 Returns:         TRUE on success
2014                  FALSE, with *errorcodeptr set non-zero on error
2015 */
2016
2017 static BOOL
2018 compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2019   int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2020   compile_data *cd, int *lengthptr)
2021 {
2022 int repeat_type, op_type;
2023 int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */
2024 int bravalue = 0;
2025 int greedy_default, greedy_non_default;
2026 int firstbyte, reqbyte;
2027 int zeroreqbyte, zerofirstbyte;
2028 int req_caseopt, reqvary, tempreqvary;
2029 int options = *optionsptr;
2030 int after_manual_callout = 0;
2031 int length_prevgroup = 0;
2032 register int c;
2033 register uschar *code = *codeptr;
2034 uschar *last_code = code;
2035 uschar *orig_code = code;
2036 uschar *tempcode;
2037 BOOL inescq = FALSE;
2038 BOOL groupsetfirstbyte = FALSE;
2039 const uschar *ptr = *ptrptr;
2040 const uschar *tempptr;
2041 uschar *previous = NULL;
2042 uschar *previous_callout = NULL;
2043 uschar *save_hwm = NULL;
2044 uschar classbits[32];
2045
2046 #ifdef SUPPORT_UTF8
2047 BOOL class_utf8;
2048 BOOL utf8 = (options & PCRE_UTF8) != 0;
2049 uschar *class_utf8data;
2050 uschar utf8_char[6];
2051 #else
2052 BOOL utf8 = FALSE;
2053 uschar *utf8_char = NULL;
2054 #endif
2055
2056 #ifdef DEBUG
2057 if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2058 #endif
2059
2060 /* Set up the default and non-default settings for greediness */
2061
2062 greedy_default = ((options & PCRE_UNGREEDY) != 0);
2063 greedy_non_default = greedy_default ^ 1;
2064
2065 /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2066 matching encountered yet". It gets changed to REQ_NONE if we hit something that
2067 matches a non-fixed char first char; reqbyte just remains unset if we never
2068 find one.
2069
2070 When we hit a repeat whose minimum is zero, we may have to adjust these values
2071 to take the zero repeat into account. This is implemented by setting them to
2072 zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2073 item types that can be repeated set these backoff variables appropriately. */
2074
2075 firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2076
2077 /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2078 according to the current setting of the caseless flag. REQ_CASELESS is a bit
2079 value > 255. It is added into the firstbyte or reqbyte variables to record the
2080 case status of the value. This is used only for ASCII characters. */
2081
2082 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2083
2084 /* Switch on next character until the end of the branch */
2085
2086 for (;; ptr++)
2087   {
2088   BOOL negate_class;
2089   BOOL possessive_quantifier;
2090   BOOL is_quantifier;
2091   BOOL is_recurse;
2092   int class_charcount;
2093   int class_lastchar;
2094   int newoptions;
2095   int recno;
2096   int skipbytes;
2097   int subreqbyte;
2098   int subfirstbyte;
2099   int terminator;
2100   int mclength;
2101   uschar mcbuffer[8];
2102
2103   /* Get next byte in the pattern */
2104
2105   c = *ptr;
2106
2107   /* If we are in the pre-compile phase, accumulate the length used for the
2108   previous cycle of this loop. */
2109
2110   if (lengthptr != NULL)
2111     {
2112 #ifdef DEBUG
2113     if (code > cd->hwm) cd->hwm = code;                 /* High water info */
2114 #endif
2115     if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
2116       {
2117       *errorcodeptr = ERR52;
2118       goto FAILED;
2119       }
2120
2121     /* There is at least one situation where code goes backwards: this is the
2122     case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2123     the class is simply eliminated. However, it is created first, so we have to
2124     allow memory for it. Therefore, don't ever reduce the length at this point.
2125     */
2126
2127     if (code < last_code) code = last_code;
2128     *lengthptr += code - last_code;
2129     DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2130
2131     /* If "previous" is set and it is not at the start of the work space, move
2132     it back to there, in order to avoid filling up the work space. Otherwise,
2133     if "previous" is NULL, reset the current code pointer to the start. */
2134
2135     if (previous != NULL)
2136       {
2137       if (previous > orig_code)
2138         {
2139         memmove(orig_code, previous, code - previous);
2140         code -= previous - orig_code;
2141         previous = orig_code;
2142         }
2143       }
2144     else code = orig_code;
2145
2146     /* Remember where this code item starts so we can pick up the length
2147     next time round. */
2148
2149     last_code = code;
2150     }
2151
2152   /* In the real compile phase, just check the workspace used by the forward
2153   reference list. */
2154
2155   else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
2156     {
2157     *errorcodeptr = ERR52;
2158     goto FAILED;
2159     }
2160
2161   /* If in \Q...\E, check for the end; if not, we have a literal */
2162
2163   if (inescq && c != 0)
2164     {
2165     if (c == '\\' && ptr[1] == 'E')
2166       {
2167       inescq = FALSE;
2168       ptr++;
2169       continue;
2170       }
2171     else
2172       {
2173       if (previous_callout != NULL)
2174         {
2175         if (lengthptr == NULL)  /* Don't attempt in pre-compile phase */
2176           complete_callout(previous_callout, ptr, cd);
2177         previous_callout = NULL;
2178         }
2179       if ((options & PCRE_AUTO_CALLOUT) != 0)
2180         {
2181         previous_callout = code;
2182         code = auto_callout(code, ptr, cd);
2183         }
2184       goto NORMAL_CHAR;
2185       }
2186     }
2187
2188   /* Fill in length of a previous callout, except when the next thing is
2189   a quantifier. */
2190
2191   is_quantifier = c == '*' || c == '+' || c == '?' ||
2192     (c == '{' && is_counted_repeat(ptr+1));
2193
2194   if (!is_quantifier && previous_callout != NULL &&
2195        after_manual_callout-- <= 0)
2196     {
2197     if (lengthptr == NULL)      /* Don't attempt in pre-compile phase */
2198       complete_callout(previous_callout, ptr, cd);
2199     previous_callout = NULL;
2200     }
2201
2202   /* In extended mode, skip white space and comments */
2203
2204   if ((options & PCRE_EXTENDED) != 0)
2205     {
2206     if ((cd->ctypes[c] & ctype_space) != 0) continue;
2207     if (c == '#')
2208       {
2209       while (*(++ptr) != 0)
2210         {
2211         if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2212         }
2213       if (*ptr != 0) continue;
2214
2215       /* Else fall through to handle end of string */
2216       c = 0;
2217       }
2218     }
2219
2220   /* No auto callout for quantifiers. */
2221
2222   if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
2223     {
2224     previous_callout = code;
2225     code = auto_callout(code, ptr, cd);
2226     }
2227
2228   switch(c)
2229     {
2230     /* ===================================================================*/
2231     case 0:                        /* The branch terminates at string end */
2232     case '|':                      /* or | or ) */
2233     case ')':
2234     *firstbyteptr = firstbyte;
2235     *reqbyteptr = reqbyte;
2236     *codeptr = code;
2237     *ptrptr = ptr;
2238     if (lengthptr != NULL)
2239       {
2240       *lengthptr += code - last_code;   /* To include callout length */
2241       DPRINTF((">> end branch\n"));
2242       }
2243     return TRUE;
2244
2245
2246     /* ===================================================================*/
2247     /* Handle single-character metacharacters. In multiline mode, ^ disables
2248     the setting of any following char as a first character. */
2249
2250     case '^':
2251     if ((options & PCRE_MULTILINE) != 0)
2252       {
2253       if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2254       }
2255     previous = NULL;
2256     *code++ = OP_CIRC;
2257     break;
2258
2259     case '$':
2260     previous = NULL;
2261     *code++ = OP_DOLL;
2262     break;
2263
2264     /* There can never be a first char if '.' is first, whatever happens about
2265     repeats. The value of reqbyte doesn't change either. */
2266
2267     case '.':
2268     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2269     zerofirstbyte = firstbyte;
2270     zeroreqbyte = reqbyte;
2271     previous = code;
2272     *code++ = OP_ANY;
2273     break;
2274
2275
2276     /* ===================================================================*/
2277     /* Character classes. If the included characters are all < 256, we build a
2278     32-byte bitmap of the permitted characters, except in the special case
2279     where there is only one such character. For negated classes, we build the
2280     map as usual, then invert it at the end. However, we use a different opcode
2281     so that data characters > 255 can be handled correctly.
2282
2283     If the class contains characters outside the 0-255 range, a different
2284     opcode is compiled. It may optionally have a bit map for characters < 256,
2285     but those above are are explicitly listed afterwards. A flag byte tells
2286     whether the bitmap is present, and whether this is a negated class or not.
2287     */
2288
2289     case '[':
2290     previous = code;
2291
2292     /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
2293     they are encountered at the top level, so we'll do that too. */
2294
2295     if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2296         check_posix_syntax(ptr, &tempptr, cd))
2297       {
2298       *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;
2299       goto FAILED;
2300       }
2301
2302     /* If the first character is '^', set the negation flag and skip it. */
2303
2304     if ((c = *(++ptr)) == '^')
2305       {
2306       negate_class = TRUE;
2307       c = *(++ptr);
2308       }
2309     else
2310       {
2311       negate_class = FALSE;
2312       }
2313
2314     /* Keep a count of chars with values < 256 so that we can optimize the case
2315     of just a single character (as long as it's < 256). However, For higher
2316     valued UTF-8 characters, we don't yet do any optimization. */
2317
2318     class_charcount = 0;
2319     class_lastchar = -1;
2320
2321     /* Initialize the 32-char bit map to all zeros. We build the map in a
2322     temporary bit of memory, in case the class contains only 1 character (less
2323     than 256), because in that case the compiled code doesn't use the bit map.
2324     */
2325
2326     memset(classbits, 0, 32 * sizeof(uschar));
2327
2328 #ifdef SUPPORT_UTF8
2329     class_utf8 = FALSE;                       /* No chars >= 256 */
2330     class_utf8data = code + LINK_SIZE + 2;    /* For UTF-8 items */
2331 #endif
2332
2333     /* Process characters until ] is reached. By writing this as a "do" it
2334     means that an initial ] is taken as a data character. At the start of the
2335     loop, c contains the first byte of the character. */
2336
2337     if (c != 0) do
2338       {
2339       const uschar *oldptr;
2340
2341 #ifdef SUPPORT_UTF8
2342       if (utf8 && c > 127)
2343         {                           /* Braces are required because the */
2344         GETCHARLEN(c, ptr, ptr);    /* macro generates multiple statements */
2345         }
2346 #endif
2347
2348       /* Inside \Q...\E everything is literal except \E */
2349
2350       if (inescq)
2351         {
2352         if (c == '\\' && ptr[1] == 'E')     /* If we are at \E */
2353           {
2354           inescq = FALSE;                   /* Reset literal state */
2355           ptr++;                            /* Skip the 'E' */
2356           continue;                         /* Carry on with next */
2357           }
2358         goto CHECK_RANGE;                   /* Could be range if \E follows */
2359         }
2360
2361       /* Handle POSIX class names. Perl allows a negation extension of the
2362       form [:^name:]. A square bracket that doesn't match the syntax is
2363       treated as a literal. We also recognize the POSIX constructions
2364       [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
2365       5.6 and 5.8 do. */
2366
2367       if (c == '[' &&
2368           (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2369           check_posix_syntax(ptr, &tempptr, cd))
2370         {
2371         BOOL local_negate = FALSE;
2372         int posix_class, taboffset, tabopt;
2373         register const uschar *cbits = cd->cbits;
2374         uschar pbits[32];
2375
2376         if (ptr[1] != ':')
2377           {
2378           *errorcodeptr = ERR31;
2379           goto FAILED;
2380           }
2381
2382         ptr += 2;
2383         if (*ptr == '^')
2384           {
2385           local_negate = TRUE;
2386           ptr++;
2387           }
2388
2389         posix_class = check_posix_name(ptr, tempptr - ptr);
2390         if (posix_class < 0)
2391           {
2392           *errorcodeptr = ERR30;
2393           goto FAILED;
2394           }
2395
2396         /* If matching is caseless, upper and lower are converted to
2397         alpha. This relies on the fact that the class table starts with
2398         alpha, lower, upper as the first 3 entries. */
2399
2400         if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
2401           posix_class = 0;
2402
2403         /* We build the bit map for the POSIX class in a chunk of local store
2404         because we may be adding and subtracting from it, and we don't want to
2405         subtract bits that may be in the main map already. At the end we or the
2406         result into the bit map that is being built. */
2407
2408         posix_class *= 3;
2409
2410         /* Copy in the first table (always present) */
2411
2412         memcpy(pbits, cbits + posix_class_maps[posix_class],
2413           32 * sizeof(uschar));
2414
2415         /* If there is a second table, add or remove it as required. */
2416
2417         taboffset = posix_class_maps[posix_class + 1];
2418         tabopt = posix_class_maps[posix_class + 2];
2419
2420         if (taboffset >= 0)
2421           {
2422           if (tabopt >= 0)
2423             for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
2424           else
2425             for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
2426           }
2427
2428         /* Not see if we need to remove any special characters. An option
2429         value of 1 removes vertical space and 2 removes underscore. */
2430
2431         if (tabopt < 0) tabopt = -tabopt;
2432         if (tabopt == 1) pbits[1] &= ~0x3c;
2433           else if (tabopt == 2) pbits[11] &= 0x7f;
2434
2435         /* Add the POSIX table or its complement into the main table that is
2436         being built and we are done. */
2437
2438         if (local_negate)
2439           for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
2440         else
2441           for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
2442
2443         ptr = tempptr + 1;
2444         class_charcount = 10;  /* Set > 1; assumes more than 1 per class */
2445         continue;    /* End of POSIX syntax handling */
2446         }
2447
2448       /* Backslash may introduce a single character, or it may introduce one
2449       of the specials, which just set a flag. The sequence \b is a special
2450       case. Inside a class (and only there) it is treated as backspace.
2451       Elsewhere it marks a word boundary. Other escapes have preset maps ready
2452       to or into the one we are building. We assume they have more than one
2453       character in them, so set class_charcount bigger than one. */
2454
2455       if (c == '\\')
2456         {
2457         c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2458         if (*errorcodeptr != 0) goto FAILED;
2459
2460         if (-c == ESC_b) c = '\b';       /* \b is backslash in a class */
2461         else if (-c == ESC_X) c = 'X';   /* \X is literal X in a class */
2462         else if (-c == ESC_R) c = 'R';   /* \R is literal R in a class */
2463         else if (-c == ESC_Q)            /* Handle start of quoted string */
2464           {
2465           if (ptr[1] == '\\' && ptr[2] == 'E')
2466             {
2467             ptr += 2; /* avoid empty string */
2468             }
2469           else inescq = TRUE;
2470           continue;
2471           }
2472
2473         if (c < 0)
2474           {
2475           register const uschar *cbits = cd->cbits;
2476           class_charcount += 2;     /* Greater than 1 is what matters */
2477
2478           /* Save time by not doing this in the pre-compile phase. */
2479
2480           if (lengthptr == NULL) switch (-c)
2481             {
2482             case ESC_d:
2483             for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
2484             continue;
2485
2486             case ESC_D:
2487             for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
2488             continue;
2489
2490             case ESC_w:
2491             for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
2492             continue;
2493
2494             case ESC_W:
2495             for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
2496             continue;
2497
2498             case ESC_s:
2499             for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
2500             classbits[1] &= ~0x08;   /* Perl 5.004 onwards omits VT from \s */
2501             continue;
2502
2503             case ESC_S:
2504             for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
2505             classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */
2506             continue;
2507
2508             case ESC_E: /* Perl ignores an orphan \E */
2509             continue;
2510
2511             default:    /* Not recognized; fall through */
2512             break;      /* Need "default" setting to stop compiler warning. */
2513             }
2514
2515           /* In the pre-compile phase, just do the recognition. */
2516
2517           else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
2518                    c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
2519
2520           /* We need to deal with \P and \p in both phases. */
2521
2522 #ifdef SUPPORT_UCP
2523           if (-c == ESC_p || -c == ESC_P)
2524             {
2525             BOOL negated;
2526             int pdata;
2527             int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
2528             if (ptype < 0) goto FAILED;
2529             class_utf8 = TRUE;
2530             *class_utf8data++ = ((-c == ESC_p) != negated)?
2531               XCL_PROP : XCL_NOTPROP;
2532             *class_utf8data++ = ptype;
2533             *class_utf8data++ = pdata;
2534             class_charcount -= 2;   /* Not a < 256 character */
2535             continue;
2536             }
2537 #endif
2538           /* Unrecognized escapes are faulted if PCRE is running in its
2539           strict mode. By default, for compatibility with Perl, they are
2540           treated as literals. */
2541
2542           if ((options & PCRE_EXTRA) != 0)
2543             {
2544             *errorcodeptr = ERR7;
2545             goto FAILED;
2546             }
2547
2548           class_charcount -= 2;  /* Undo the default count from above */
2549           c = *ptr;              /* Get the final character and fall through */
2550           }
2551
2552         /* Fall through if we have a single character (c >= 0). This may be
2553         greater than 256 in UTF-8 mode. */
2554
2555         }   /* End of backslash handling */
2556
2557       /* A single character may be followed by '-' to form a range. However,
2558       Perl does not permit ']' to be the end of the range. A '-' character
2559       at the end is treated as a literal. Perl ignores orphaned \E sequences
2560       entirely. The code for handling \Q and \E is messy. */
2561
2562       CHECK_RANGE:
2563       while (ptr[1] == '\\' && ptr[2] == 'E')
2564         {
2565         inescq = FALSE;
2566         ptr += 2;
2567         }
2568
2569       oldptr = ptr;
2570
2571       if (!inescq && ptr[1] == '-')
2572         {
2573         int d;
2574         ptr += 2;
2575         while (*ptr == '\\' && ptr[1] == 'E') ptr += 2;
2576
2577         /* If we hit \Q (not followed by \E) at this point, go into escaped
2578         mode. */
2579
2580         while (*ptr == '\\' && ptr[1] == 'Q')
2581           {
2582           ptr += 2;
2583           if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; }
2584           inescq = TRUE;
2585           break;
2586           }
2587
2588         if (*ptr == 0 || (!inescq && *ptr == ']'))
2589           {
2590           ptr = oldptr;
2591           goto LONE_SINGLE_CHARACTER;
2592           }
2593
2594 #ifdef SUPPORT_UTF8
2595         if (utf8)
2596           {                           /* Braces are required because the */
2597           GETCHARLEN(d, ptr, ptr);    /* macro generates multiple statements */
2598           }
2599         else
2600 #endif
2601         d = *ptr;  /* Not UTF-8 mode */
2602
2603         /* The second part of a range can be a single-character escape, but
2604         not any of the other escapes. Perl 5.6 treats a hyphen as a literal
2605         in such circumstances. */
2606
2607         if (!inescq && d == '\\')
2608           {
2609           d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2610           if (*errorcodeptr != 0) goto FAILED;
2611
2612           /* \b is backslash; \X is literal X; \R is literal R; any other
2613           special means the '-' was literal */
2614
2615           if (d < 0)
2616             {
2617             if (d == -ESC_b) d = '\b';
2618             else if (d == -ESC_X) d = 'X';
2619             else if (d == -ESC_R) d = 'R'; else
2620               {
2621               ptr = oldptr;
2622               goto LONE_SINGLE_CHARACTER;  /* A few lines below */
2623               }
2624             }
2625           }
2626
2627         /* Check that the two values are in the correct order. Optimize
2628         one-character ranges */
2629
2630         if (d < c)
2631           {
2632           *errorcodeptr = ERR8;
2633           goto FAILED;
2634           }
2635
2636         if (d == c) goto LONE_SINGLE_CHARACTER;  /* A few lines below */
2637
2638         /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
2639         matching, we have to use an XCLASS with extra data items. Caseless
2640         matching for characters > 127 is available only if UCP support is
2641         available. */
2642
2643 #ifdef SUPPORT_UTF8
2644         if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
2645           {
2646           class_utf8 = TRUE;
2647
2648           /* With UCP support, we can find the other case equivalents of
2649           the relevant characters. There may be several ranges. Optimize how
2650           they fit with the basic range. */
2651
2652 #ifdef SUPPORT_UCP
2653           if ((options & PCRE_CASELESS) != 0)
2654             {
2655             unsigned int occ, ocd;
2656             unsigned int cc = c;
2657             unsigned int origd = d;
2658             while (get_othercase_range(&cc, origd, &occ, &ocd))
2659               {
2660               if (occ >= c && ocd <= d) continue;  /* Skip embedded ranges */
2661
2662               if (occ < c  && ocd >= c - 1)        /* Extend the basic range */
2663                 {                                  /* if there is overlap,   */
2664                 c = occ;                           /* noting that if occ < c */
2665                 continue;                          /* we can't have ocd > d  */
2666                 }                                  /* because a subrange is  */
2667               if (ocd > d && occ <= d + 1)         /* always shorter than    */
2668                 {                                  /* the basic range.       */
2669                 d = ocd;
2670                 continue;
2671                 }
2672
2673               if (occ == ocd)
2674                 {
2675                 *class_utf8data++ = XCL_SINGLE;
2676                 }
2677               else
2678                 {
2679                 *class_utf8data++ = XCL_RANGE;
2680                 class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
2681                 }
2682               class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
2683               }
2684             }
2685 #endif  /* SUPPORT_UCP */
2686
2687           /* Now record the original range, possibly modified for UCP caseless
2688           overlapping ranges. */
2689
2690           *class_utf8data++ = XCL_RANGE;
2691           class_utf8data += _pcre_ord2utf8(c, class_utf8data);
2692           class_utf8data += _pcre_ord2utf8(d, class_utf8data);
2693
2694           /* With UCP support, we are done. Without UCP support, there is no
2695           caseless matching for UTF-8 characters > 127; we can use the bit map
2696           for the smaller ones. */
2697
2698 #ifdef SUPPORT_UCP
2699           continue;    /* With next character in the class */
2700 #else
2701           if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
2702
2703           /* Adjust upper limit and fall through to set up the map */
2704
2705           d = 127;
2706
2707 #endif  /* SUPPORT_UCP */
2708           }
2709 #endif  /* SUPPORT_UTF8 */
2710
2711         /* We use the bit map for all cases when not in UTF-8 mode; else
2712         ranges that lie entirely within 0-127 when there is UCP support; else
2713         for partial ranges without UCP support. */
2714
2715         class_charcount += d - c + 1;
2716         class_lastchar = d;
2717
2718         /* We can save a bit of time by skipping this in the pre-compile. */
2719
2720         if (lengthptr == NULL) for (; c <= d; c++)
2721           {
2722           classbits[c/8] |= (1 << (c&7));
2723           if ((options & PCRE_CASELESS) != 0)
2724             {
2725             int uc = cd->fcc[c];           /* flip case */
2726             classbits[uc/8] |= (1 << (uc&7));
2727             }
2728           }
2729
2730         continue;   /* Go get the next char in the class */
2731         }
2732
2733       /* Handle a lone single character - we can get here for a normal
2734       non-escape char, or after \ that introduces a single character or for an
2735       apparent range that isn't. */
2736
2737       LONE_SINGLE_CHARACTER:
2738
2739       /* Handle a character that cannot go in the bit map */
2740
2741 #ifdef SUPPORT_UTF8
2742       if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
2743         {
2744         class_utf8 = TRUE;
2745         *class_utf8data++ = XCL_SINGLE;
2746         class_utf8data += _pcre_ord2utf8(c, class_utf8data);
2747
2748 #ifdef SUPPORT_UCP
2749         if ((options & PCRE_CASELESS) != 0)
2750           {
2751           unsigned int othercase;
2752           if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR)
2753             {
2754             *class_utf8data++ = XCL_SINGLE;
2755             class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
2756             }
2757           }
2758 #endif  /* SUPPORT_UCP */
2759
2760         }
2761       else
2762 #endif  /* SUPPORT_UTF8 */
2763
2764       /* Handle a single-byte character */
2765         {
2766         classbits[c/8] |= (1 << (c&7));
2767         if ((options & PCRE_CASELESS) != 0)
2768           {
2769           c = cd->fcc[c];   /* flip case */
2770           classbits[c/8] |= (1 << (c&7));
2771           }
2772         class_charcount++;
2773         class_lastchar = c;
2774         }
2775       }
2776
2777     /* Loop until ']' reached. This "while" is the end of the "do" above. */
2778
2779     while ((c = *(++ptr)) != 0 && (c != ']' || inescq));
2780
2781     if (c == 0)                          /* Missing terminating ']' */
2782       {
2783       *errorcodeptr = ERR6;
2784       goto FAILED;
2785       }
2786
2787     /* If class_charcount is 1, we saw precisely one character whose value is
2788     less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we
2789     can optimize the negative case only if there were no characters >= 128
2790     because OP_NOT and the related opcodes like OP_NOTSTAR operate on
2791     single-bytes only. This is an historical hangover. Maybe one day we can
2792     tidy these opcodes to handle multi-byte characters.
2793
2794     The optimization throws away the bit map. We turn the item into a
2795     1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
2796     that OP_NOT does not support multibyte characters. In the positive case, it
2797     can cause firstbyte to be set. Otherwise, there can be no first char if
2798     this item is first, whatever repeat count may follow. In the case of
2799     reqbyte, save the previous value for reinstating. */
2800
2801 #ifdef SUPPORT_UTF8
2802     if (class_charcount == 1 &&
2803           (!utf8 ||
2804           (!class_utf8 && (!negate_class || class_lastchar < 128))))
2805
2806 #else
2807     if (class_charcount == 1)
2808 #endif
2809       {
2810       zeroreqbyte = reqbyte;
2811
2812       /* The OP_NOT opcode works on one-byte characters only. */
2813
2814       if (negate_class)
2815         {
2816         if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2817         zerofirstbyte = firstbyte;
2818         *code++ = OP_NOT;
2819         *code++ = class_lastchar;
2820         break;
2821         }
2822
2823       /* For a single, positive character, get the value into mcbuffer, and
2824       then we can handle this with the normal one-character code. */
2825
2826 #ifdef SUPPORT_UTF8
2827       if (utf8 && class_lastchar > 127)
2828         mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
2829       else
2830 #endif
2831         {
2832         mcbuffer[0] = class_lastchar;
2833         mclength = 1;
2834         }
2835       goto ONE_CHAR;
2836       }       /* End of 1-char optimization */
2837
2838     /* The general case - not the one-char optimization. If this is the first
2839     thing in the branch, there can be no first char setting, whatever the
2840     repeat count. Any reqbyte setting must remain unchanged after any kind of
2841     repeat. */
2842
2843     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2844     zerofirstbyte = firstbyte;
2845     zeroreqbyte = reqbyte;
2846
2847     /* If there are characters with values > 255, we have to compile an
2848     extended class, with its own opcode. If there are no characters < 256,
2849     we can omit the bitmap in the actual compiled code. */
2850
2851 #ifdef SUPPORT_UTF8
2852     if (class_utf8)
2853       {
2854       *class_utf8data++ = XCL_END;    /* Marks the end of extra data */
2855       *code++ = OP_XCLASS;
2856       code += LINK_SIZE;
2857       *code = negate_class? XCL_NOT : 0;
2858
2859       /* If the map is required, move up the extra data to make room for it;
2860       otherwise just move the code pointer to the end of the extra data. */
2861
2862       if (class_charcount > 0)
2863         {
2864         *code++ |= XCL_MAP;
2865         memmove(code + 32, code, class_utf8data - code);
2866         memcpy(code, classbits, 32);
2867         code = class_utf8data + 32;
2868         }
2869       else code = class_utf8data;
2870
2871       /* Now fill in the complete length of the item */
2872
2873       PUT(previous, 1, code - previous);
2874       break;   /* End of class handling */
2875       }
2876 #endif
2877
2878     /* If there are no characters > 255, negate the 32-byte map if necessary,
2879     and copy it into the code vector. If this is the first thing in the branch,
2880     there can be no first char setting, whatever the repeat count. Any reqbyte
2881     setting must remain unchanged after any kind of repeat. */
2882
2883     if (negate_class)
2884       {
2885       *code++ = OP_NCLASS;
2886       if (lengthptr == NULL)    /* Save time in the pre-compile phase */
2887         for (c = 0; c < 32; c++) code[c] = ~classbits[c];
2888       }
2889     else
2890       {
2891       *code++ = OP_CLASS;
2892       memcpy(code, classbits, 32);
2893       }
2894     code += 32;
2895     break;
2896
2897
2898     /* ===================================================================*/
2899     /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
2900     has been tested above. */
2901
2902     case '{':
2903     if (!is_quantifier) goto NORMAL_CHAR;
2904     ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
2905     if (*errorcodeptr != 0) goto FAILED;
2906     goto REPEAT;
2907
2908     case '*':
2909     repeat_min = 0;
2910     repeat_max = -1;
2911     goto REPEAT;
2912
2913     case '+':
2914     repeat_min = 1;
2915     repeat_max = -1;
2916     goto REPEAT;
2917
2918     case '?':
2919     repeat_min = 0;
2920     repeat_max = 1;
2921
2922     REPEAT:
2923     if (previous == NULL)
2924       {
2925       *errorcodeptr = ERR9;
2926       goto FAILED;
2927       }
2928
2929     if (repeat_min == 0)
2930       {
2931       firstbyte = zerofirstbyte;    /* Adjust for zero repeat */
2932       reqbyte = zeroreqbyte;        /* Ditto */
2933       }
2934
2935     /* Remember whether this is a variable length repeat */
2936
2937     reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
2938
2939     op_type = 0;                    /* Default single-char op codes */
2940     possessive_quantifier = FALSE;  /* Default not possessive quantifier */
2941
2942     /* Save start of previous item, in case we have to move it up to make space
2943     for an inserted OP_ONCE for the additional '+' extension. */
2944
2945     tempcode = previous;
2946
2947     /* If the next character is '+', we have a possessive quantifier. This
2948     implies greediness, whatever the setting of the PCRE_UNGREEDY option.
2949     If the next character is '?' this is a minimizing repeat, by default,
2950     but if PCRE_UNGREEDY is set, it works the other way round. We change the
2951     repeat type to the non-default. */
2952
2953     if (ptr[1] == '+')
2954       {
2955       repeat_type = 0;                  /* Force greedy */
2956       possessive_quantifier = TRUE;
2957       ptr++;
2958       }
2959     else if (ptr[1] == '?')
2960       {
2961       repeat_type = greedy_non_default;
2962       ptr++;
2963       }
2964     else repeat_type = greedy_default;
2965
2966     /* If previous was a character match, abolish the item and generate a
2967     repeat item instead. If a char item has a minimum of more than one, ensure
2968     that it is set in reqbyte - it might not be if a sequence such as x{3} is
2969     the first thing in a branch because the x will have gone into firstbyte
2970     instead.  */
2971
2972     if (*previous == OP_CHAR || *previous == OP_CHARNC)
2973       {
2974       /* Deal with UTF-8 characters that take up more than one byte. It's
2975       easier to write this out separately than try to macrify it. Use c to
2976       hold the length of the character in bytes, plus 0x80 to flag that it's a
2977       length rather than a small character. */
2978
2979 #ifdef SUPPORT_UTF8
2980       if (utf8 && (code[-1] & 0x80) != 0)
2981         {
2982         uschar *lastchar = code - 1;
2983         while((*lastchar & 0xc0) == 0x80) lastchar--;
2984         c = code - lastchar;            /* Length of UTF-8 character */
2985         memcpy(utf8_char, lastchar, c); /* Save the char */
2986         c |= 0x80;                      /* Flag c as a length */
2987         }
2988       else
2989 #endif
2990
2991       /* Handle the case of a single byte - either with no UTF8 support, or
2992       with UTF-8 disabled, or for a UTF-8 character < 128. */
2993
2994         {
2995         c = code[-1];
2996         if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
2997         }
2998
2999       /* If the repetition is unlimited, it pays to see if the next thing on
3000       the line is something that cannot possibly match this character. If so,
3001       automatically possessifying this item gains some performance in the case
3002       where the match fails. */
3003
3004       if (!possessive_quantifier &&
3005           repeat_max < 0 &&
3006           check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3007             options, cd))
3008         {
3009         repeat_type = 0;    /* Force greedy */
3010         possessive_quantifier = TRUE;
3011         }
3012
3013       goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */
3014       }
3015
3016     /* If previous was a single negated character ([^a] or similar), we use
3017     one of the special opcodes, replacing it. The code is shared with single-
3018     character repeats by setting op_type to add a suitable offset into
3019     repeat_type. We can also test for auto-possessification. OP_NOT is
3020     currently used only for single-byte chars. */
3021
3022     else if (*previous == OP_NOT)
3023       {
3024       op_type = OP_NOTSTAR - OP_STAR;  /* Use "not" opcodes */
3025       c = previous[1];
3026       if (!possessive_quantifier &&
3027           repeat_max < 0 &&
3028           check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3029         {
3030         repeat_type = 0;    /* Force greedy */
3031         possessive_quantifier = TRUE;
3032         }
3033       goto OUTPUT_SINGLE_REPEAT;
3034       }
3035
3036     /* If previous was a character type match (\d or similar), abolish it and
3037     create a suitable repeat item. The code is shared with single-character
3038     repeats by setting op_type to add a suitable offset into repeat_type. Note
3039     that the Unicode property types will be present only when SUPPORT_UCP is
3040     defined, but we don't wrap the little bits of code here because it just
3041     makes it horribly messy. */
3042
3043     else if (*previous < OP_EODN)
3044       {
3045       uschar *oldcode;
3046       int prop_type, prop_value;
3047       op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */
3048       c = *previous;
3049
3050       if (!possessive_quantifier &&
3051           repeat_max < 0 &&
3052           check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3053         {
3054         repeat_type = 0;    /* Force greedy */
3055         possessive_quantifier = TRUE;
3056         }
3057
3058       OUTPUT_SINGLE_REPEAT:
3059       if (*previous == OP_PROP || *previous == OP_NOTPROP)
3060         {
3061         prop_type = previous[1];
3062         prop_value = previous[2];
3063         }
3064       else prop_type = prop_value = -1;
3065
3066       oldcode = code;
3067       code = previous;                  /* Usually overwrite previous item */
3068
3069       /* If the maximum is zero then the minimum must also be zero; Perl allows
3070       this case, so we do too - by simply omitting the item altogether. */
3071
3072       if (repeat_max == 0) goto END_REPEAT;
3073
3074       /* All real repeats make it impossible to handle partial matching (maybe
3075       one day we will be able to remove this restriction). */
3076
3077       if (repeat_max != 1) cd->nopartial = TRUE;
3078
3079       /* Combine the op_type with the repeat_type */
3080
3081       repeat_type += op_type;
3082
3083       /* A minimum of zero is handled either as the special case * or ?, or as
3084       an UPTO, with the maximum given. */
3085
3086       if (repeat_min == 0)
3087         {
3088         if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
3089           else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
3090         else
3091           {
3092           *code++ = OP_UPTO + repeat_type;
3093           PUT2INC(code, 0, repeat_max);
3094           }
3095         }
3096
3097       /* A repeat minimum of 1 is optimized into some special cases. If the
3098       maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3099       left in place and, if the maximum is greater than 1, we use OP_UPTO with
3100       one less than the maximum. */
3101
3102       else if (repeat_min == 1)
3103         {
3104         if (repeat_max == -1)
3105           *code++ = OP_PLUS + repeat_type;
3106         else
3107           {
3108           code = oldcode;                 /* leave previous item in place */
3109           if (repeat_max == 1) goto END_REPEAT;
3110           *code++ = OP_UPTO + repeat_type;
3111           PUT2INC(code, 0, repeat_max - 1);
3112           }
3113         }
3114
3115       /* The case {n,n} is just an EXACT, while the general case {n,m} is
3116       handled as an EXACT followed by an UPTO. */
3117
3118       else
3119         {
3120         *code++ = OP_EXACT + op_type;  /* NB EXACT doesn't have repeat_type */
3121         PUT2INC(code, 0, repeat_min);
3122
3123         /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
3124         we have to insert the character for the previous code. For a repeated
3125         Unicode property match, there are two extra bytes that define the
3126         required property. In UTF-8 mode, long characters have their length in
3127         c, with the 0x80 bit as a flag. */
3128
3129         if (repeat_max < 0)
3130           {
3131 #ifdef SUPPORT_UTF8
3132           if (utf8 && c >= 128)
3133             {
3134             memcpy(code, utf8_char, c & 7);
3135             code += c & 7;
3136             }
3137           else
3138 #endif
3139             {
3140             *code++ = c;
3141             if (prop_type >= 0)
3142               {
3143               *code++ = prop_type;
3144               *code++ = prop_value;
3145               }
3146             }
3147           *code++ = OP_STAR + repeat_type;
3148           }
3149
3150         /* Else insert an UPTO if the max is greater than the min, again
3151         preceded by the character, for the previously inserted code. If the
3152         UPTO is just for 1 instance, we can use QUERY instead. */
3153
3154         else if (repeat_max != repeat_min)
3155           {
3156 #ifdef SUPPORT_UTF8
3157           if (utf8 && c >= 128)
3158             {
3159             memcpy(code, utf8_char, c & 7);
3160             code += c & 7;
3161             }
3162           else
3163 #endif
3164           *code++ = c;
3165           if (prop_type >= 0)
3166             {
3167             *code++ = prop_type;
3168             *code++ = prop_value;
3169             }
3170           repeat_max -= repeat_min;
3171
3172           if (repeat_max == 1)
3173             {
3174             *code++ = OP_QUERY + repeat_type;
3175             }
3176           else
3177             {
3178             *code++ = OP_UPTO + repeat_type;
3179             PUT2INC(code, 0, repeat_max);
3180             }
3181           }
3182         }
3183
3184       /* The character or character type itself comes last in all cases. */
3185
3186 #ifdef SUPPORT_UTF8
3187       if (utf8 && c >= 128)
3188         {
3189         memcpy(code, utf8_char, c & 7);
3190         code += c & 7;
3191         }
3192       else
3193 #endif
3194       *code++ = c;
3195
3196       /* For a repeated Unicode property match, there are two extra bytes that
3197       define the required property. */
3198
3199 #ifdef SUPPORT_UCP
3200       if (prop_type >= 0)
3201         {
3202         *code++ = prop_type;
3203         *code++ = prop_value;
3204         }
3205 #endif
3206       }
3207
3208     /* If previous was a character class or a back reference, we put the repeat
3209     stuff after it, but just skip the item if the repeat was {0,0}. */
3210
3211     else if (*previous == OP_CLASS ||
3212              *previous == OP_NCLASS ||
3213 #ifdef SUPPORT_UTF8
3214              *previous == OP_XCLASS ||
3215 #endif
3216              *previous == OP_REF)
3217       {
3218       if (repeat_max == 0)
3219         {
3220         code = previous;
3221         goto END_REPEAT;
3222         }
3223
3224       /* All real repeats make it impossible to handle partial matching (maybe
3225       one day we will be able to remove this restriction). */
3226
3227       if (repeat_max != 1) cd->nopartial = TRUE;
3228
3229       if (repeat_min == 0 && repeat_max == -1)
3230         *code++ = OP_CRSTAR + repeat_type;
3231       else if (repeat_min == 1 && repeat_max == -1)
3232         *code++ = OP_CRPLUS + repeat_type;
3233       else if (repeat_min == 0 && repeat_max == 1)
3234         *code++ = OP_CRQUERY + repeat_type;
3235       else
3236         {
3237         *code++ = OP_CRRANGE + repeat_type;
3238         PUT2INC(code, 0, repeat_min);
3239         if (repeat_max == -1) repeat_max = 0;  /* 2-byte encoding for max */
3240         PUT2INC(code, 0, repeat_max);
3241         }
3242       }
3243
3244     /* If previous was a bracket group, we may have to replicate it in certain
3245     cases. */
3246
3247     else if (*previous == OP_BRA  || *previous == OP_CBRA ||
3248              *previous == OP_ONCE || *previous == OP_COND)
3249       {
3250       register int i;
3251       int ketoffset = 0;
3252       int len = code - previous;
3253       uschar *bralink = NULL;
3254
3255       /* Repeating a DEFINE group is pointless */
3256
3257       if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
3258         {
3259         *errorcodeptr = ERR55;
3260         goto FAILED;
3261         }
3262
3263       /* This is a paranoid check to stop integer overflow later on */
3264
3265       if (len > MAX_DUPLENGTH)
3266         {
3267         *errorcodeptr = ERR50;
3268         goto FAILED;
3269         }
3270
3271       /* If the maximum repeat count is unlimited, find the end of the bracket
3272       by scanning through from the start, and compute the offset back to it
3273       from the current code pointer. There may be an OP_OPT setting following
3274       the final KET, so we can't find the end just by going back from the code
3275       pointer. */
3276
3277       if (repeat_max == -1)
3278         {
3279         register uschar *ket = previous;
3280         do ket += GET(ket, 1); while (*ket != OP_KET);
3281         ketoffset = code - ket;
3282         }
3283
3284       /* The case of a zero minimum is special because of the need to stick
3285       OP_BRAZERO in front of it, and because the group appears once in the
3286       data, whereas in other cases it appears the minimum number of times. For
3287       this reason, it is simplest to treat this case separately, as otherwise
3288       the code gets far too messy. There are several special subcases when the
3289       minimum is zero. */
3290
3291       if (repeat_min == 0)
3292         {
3293         /* If the maximum is also zero, we just omit the group from the output
3294         altogether. */
3295
3296         if (repeat_max == 0)
3297           {
3298           code = previous;
3299           goto END_REPEAT;
3300           }
3301
3302         /* If the maximum is 1 or unlimited, we just have to stick in the
3303         BRAZERO and do no more at this point. However, we do need to adjust
3304         any OP_RECURSE calls inside the group that refer to the group itself or
3305         any internal or forward referenced group, because the offset is from
3306         the start of the whole regex. Temporarily terminate the pattern while
3307         doing this. */
3308
3309         if (repeat_max <= 1)
3310           {
3311           *code = OP_END;
3312           adjust_recurse(previous, 1, utf8, cd, save_hwm);
3313           memmove(previous+1, previous, len);
3314           code++;
3315           *previous++ = OP_BRAZERO + repeat_type;
3316           }
3317
3318         /* If the maximum is greater than 1 and limited, we have to replicate
3319         in a nested fashion, sticking OP_BRAZERO before each set of brackets.
3320         The first one has to be handled carefully because it's the original
3321         copy, which has to be moved up. The remainder can be handled by code
3322         that is common with the non-zero minimum case below. We have to
3323         adjust the value of repeat_max, since one less copy is required. Once
3324         again, we may have to adjust any OP_RECURSE calls inside the group. */
3325
3326         else
3327           {
3328           int offset;
3329           *code = OP_END;
3330           adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
3331           memmove(previous + 2 + LINK_SIZE, previous, len);
3332           code += 2 + LINK_SIZE;
3333           *previous++ = OP_BRAZERO + repeat_type;
3334           *previous++ = OP_BRA;
3335
3336           /* We chain together the bracket offset fields that have to be
3337           filled in later when the ends of the brackets are reached. */
3338
3339           offset = (bralink == NULL)? 0 : previous - bralink;
3340           bralink = previous;
3341           PUTINC(previous, 0, offset);
3342           }
3343
3344         repeat_max--;
3345         }
3346
3347       /* If the minimum is greater than zero, replicate the group as many
3348       times as necessary, and adjust the maximum to the number of subsequent
3349       copies that we need. If we set a first char from the group, and didn't
3350       set a required char, copy the latter from the former. If there are any
3351       forward reference subroutine calls in the group, there will be entries on
3352       the workspace list; replicate these with an appropriate increment. */
3353
3354       else
3355         {
3356         if (repeat_min > 1)
3357           {
3358           /* In the pre-compile phase, we don't actually do the replication. We
3359           just adjust the length as if we had. */
3360
3361           if (lengthptr != NULL)
3362             *lengthptr += (repeat_min - 1)*length_prevgroup;
3363
3364           /* This is compiling for real */
3365
3366           else
3367             {
3368             if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
3369             for (i = 1; i < repeat_min; i++)
3370               {
3371               uschar *hc;
3372               uschar *this_hwm = cd->hwm;
3373               memcpy(code, previous, len);
3374               for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3375                 {
3376                 PUT(cd->hwm, 0, GET(hc, 0) + len);
3377                 cd->hwm += LINK_SIZE;
3378                 }
3379               save_hwm = this_hwm;
3380               code += len;
3381               }
3382             }
3383           }
3384
3385         if (repeat_max > 0) repeat_max -= repeat_min;
3386         }
3387
3388       /* This code is common to both the zero and non-zero minimum cases. If
3389       the maximum is limited, it replicates the group in a nested fashion,
3390       remembering the bracket starts on a stack. In the case of a zero minimum,
3391       the first one was set up above. In all cases the repeat_max now specifies
3392       the number of additional copies needed. Again, we must remember to
3393       replicate entries on the forward reference list. */
3394
3395       if (repeat_max >= 0)
3396         {
3397         /* In the pre-compile phase, we don't actually do the replication. We
3398         just adjust the length as if we had. For each repetition we must add 1
3399         to the length for BRAZERO and for all but the last repetition we must
3400         add 2 + 2*LINK_SIZE to allow for the nesting that occurs. */
3401
3402         if (lengthptr != NULL && repeat_max > 0)
3403           *lengthptr += repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
3404             2 - 2*LINK_SIZE;  /* Last one doesn't nest */
3405
3406         /* This is compiling for real */
3407
3408         else for (i = repeat_max - 1; i >= 0; i--)
3409           {
3410           uschar *hc;
3411           uschar *this_hwm = cd->hwm;
3412
3413           *code++ = OP_BRAZERO + repeat_type;
3414
3415           /* All but the final copy start a new nesting, maintaining the
3416           chain of brackets outstanding. */
3417
3418           if (i != 0)
3419             {
3420             int offset;
3421             *code++ = OP_BRA;
3422             offset = (bralink == NULL)? 0 : code - bralink;
3423             bralink = code;
3424             PUTINC(code, 0, offset);
3425             }
3426
3427           memcpy(code, previous, len);
3428           for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3429             {
3430             PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
3431             cd->hwm += LINK_SIZE;
3432             }
3433           save_hwm = this_hwm;
3434           code += len;
3435           }
3436
3437         /* Now chain through the pending brackets, and fill in their length
3438         fields (which are holding the chain links pro tem). */
3439
3440         while (bralink != NULL)
3441           {
3442           int oldlinkoffset;
3443           int offset = code - bralink + 1;
3444           uschar *bra = code - offset;
3445           oldlinkoffset = GET(bra, 1);
3446           bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
3447           *code++ = OP_KET;
3448           PUTINC(code, 0, offset);
3449           PUT(bra, 1, offset);
3450           }
3451         }
3452
3453       /* If the maximum is unlimited, set a repeater in the final copy. We
3454       can't just offset backwards from the current code point, because we
3455       don't know if there's been an options resetting after the ket. The
3456       correct offset was computed above.
3457
3458       Then, when we are doing the actual compile phase, check to see whether
3459       this group is a non-atomic one that could match an empty string. If so,
3460       convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
3461       that runtime checking can be done. [This check is also applied to
3462       atomic groups at runtime, but in a different way.] */
3463
3464       else
3465         {
3466         uschar *ketcode = code - ketoffset;
3467         uschar *bracode = ketcode - GET(ketcode, 1);
3468         *ketcode = OP_KETRMAX + repeat_type;
3469         if (lengthptr == NULL && *bracode != OP_ONCE)
3470           {
3471           uschar *scode = bracode;
3472           do
3473             {
3474             if (could_be_empty_branch(scode, ketcode, utf8))
3475               {
3476               *bracode += OP_SBRA - OP_BRA;
3477               break;
3478               }
3479             scode += GET(scode, 1);
3480             }
3481           while (*scode == OP_ALT);
3482           }
3483         }
3484       }
3485
3486     /* Else there's some kind of shambles */
3487
3488     else
3489       {
3490       *errorcodeptr = ERR11;
3491       goto FAILED;
3492       }
3493
3494     /* If the character following a repeat is '+', or if certain optimization
3495     tests above succeeded, possessive_quantifier is TRUE. For some of the
3496     simpler opcodes, there is a special alternative opcode for this. For
3497     anything else, we wrap the entire repeated item inside OP_ONCE brackets.
3498     The '+' notation is just syntactic sugar, taken from Sun's Java package,
3499     but the special opcodes can optimize it a bit. The repeated item starts at
3500     tempcode, not at previous, which might be the first part of a string whose
3501     (former) last char we repeated.
3502
3503     Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
3504     an 'upto' may follow. We skip over an 'exact' item, and then test the
3505     length of what remains before proceeding. */
3506
3507     if (possessive_quantifier)
3508       {
3509       int len;
3510       if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||
3511           *tempcode == OP_NOTEXACT)
3512         tempcode += _pcre_OP_lengths[*tempcode];
3513       len = code - tempcode;
3514       if (len > 0) switch (*tempcode)
3515         {
3516         case OP_STAR:  *tempcode = OP_POSSTAR; break;
3517         case OP_PLUS:  *tempcode = OP_POSPLUS; break;
3518         case OP_QUERY: *tempcode = OP_POSQUERY; break;
3519         case OP_UPTO:  *tempcode = OP_POSUPTO; break;
3520
3521         case OP_TYPESTAR:  *tempcode = OP_TYPEPOSSTAR; break;
3522         case OP_TYPEPLUS:  *tempcode = OP_TYPEPOSPLUS; break;
3523         case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
3524         case OP_TYPEUPTO:  *tempcode = OP_TYPEPOSUPTO; break;
3525
3526         case OP_NOTSTAR:  *tempcode = OP_NOTPOSSTAR; break;
3527         case OP_NOTPLUS:  *tempcode = OP_NOTPOSPLUS; break;
3528         case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
3529         case OP_NOTUPTO:  *tempcode = OP_NOTPOSUPTO; break;
3530
3531         default:
3532         memmove(tempcode + 1+LINK_SIZE, tempcode, len);
3533         code += 1 + LINK_SIZE;
3534         len += 1 + LINK_SIZE;
3535         tempcode[0] = OP_ONCE;
3536         *code++ = OP_KET;
3537         PUTINC(code, 0, len);
3538         PUT(tempcode, 1, len);
3539         break;
3540         }
3541       }
3542
3543     /* In all cases we no longer have a previous item. We also set the
3544     "follows varying string" flag for subsequently encountered reqbytes if
3545     it isn't already set and we have just passed a varying length item. */
3546
3547     END_REPEAT:
3548     previous = NULL;
3549     cd->req_varyopt |= reqvary;
3550     break;
3551
3552
3553     /* ===================================================================*/
3554     /* Start of nested parenthesized sub-expression, or comment or lookahead or
3555     lookbehind or option setting or condition or all the other extended
3556     parenthesis forms. First deal with the specials; all are introduced by ?,
3557     and the appearance of any of them means that this is not a capturing
3558     group. */
3559
3560     case '(':
3561     newoptions = options;
3562     skipbytes = 0;
3563     bravalue = OP_CBRA;
3564     save_hwm = cd->hwm;
3565
3566     if (*(++ptr) == '?')
3567       {
3568       int i, set, unset, namelen;
3569       int *optset;
3570       const uschar *name;
3571       uschar *slot;
3572
3573       switch (*(++ptr))
3574         {
3575         case '#':                 /* Comment; skip to ket */
3576         ptr++;
3577         while (*ptr != 0 && *ptr != ')') ptr++;
3578         if (*ptr == 0)
3579           {
3580           *errorcodeptr = ERR18;
3581           goto FAILED;
3582           }
3583         continue;
3584
3585
3586         /* ------------------------------------------------------------ */
3587         case ':':                 /* Non-capturing bracket */
3588         bravalue = OP_BRA;
3589         ptr++;
3590         break;
3591
3592
3593         /* ------------------------------------------------------------ */
3594         case '(':
3595         bravalue = OP_COND;       /* Conditional group */
3596
3597         /* A condition can be an assertion, a number (referring to a numbered
3598         group), a name (referring to a named group), or 'R', referring to
3599         recursion. R<digits> and R&name are also permitted for recursion tests.
3600
3601         There are several syntaxes for testing a named group: (?(name)) is used
3602         by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
3603
3604         There are two unfortunate ambiguities, caused by history. (a) 'R' can
3605         be the recursive thing or the name 'R' (and similarly for 'R' followed
3606         by digits), and (b) a number could be a name that consists of digits.
3607         In both cases, we look for a name first; if not found, we try the other
3608         cases. */
3609
3610         /* For conditions that are assertions, check the syntax, and then exit
3611         the switch. This will take control down to where bracketed groups,
3612         including assertions, are processed. */
3613
3614         if (ptr[1] == '?' && (ptr[2] == '=' || ptr[2] == '!' || ptr[2] == '<'))
3615           break;
3616
3617         /* Most other conditions use OP_CREF (a couple change to OP_RREF
3618         below), and all need to skip 3 bytes at the start of the group. */
3619
3620         code[1+LINK_SIZE] = OP_CREF;
3621         skipbytes = 3;
3622
3623         /* Check for a test for recursion in a named group. */
3624
3625         if (ptr[1] == 'R' && ptr[2] == '&')
3626           {
3627           terminator = -1;
3628           ptr += 2;
3629           code[1+LINK_SIZE] = OP_RREF;    /* Change the type of test */
3630           }
3631
3632         /* Check for a test for a named group's having been set, using the Perl
3633         syntax (?(<name>) or (?('name') */
3634
3635         else if (ptr[1] == '<')
3636           {
3637           terminator = '>';
3638           ptr++;
3639           }
3640         else if (ptr[1] == '\'')
3641           {
3642           terminator = '\'';
3643           ptr++;
3644           }
3645         else terminator = 0;
3646
3647         /* We now expect to read a name; anything else is an error */
3648
3649         if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
3650           {
3651           ptr += 1;  /* To get the right offset */
3652           *errorcodeptr = ERR28;
3653           goto FAILED;
3654           }
3655
3656         /* Read the name, but also get it as a number if it's all digits */
3657
3658         recno = 0;
3659         name = ++ptr;
3660         while ((cd->ctypes[*ptr] & ctype_word) != 0)
3661           {
3662           if (recno >= 0)
3663             recno = ((digitab[*ptr] & ctype_digit) != 0)?
3664               recno * 10 + *ptr - '0' : -1;
3665           ptr++;
3666           }
3667         namelen = ptr - name;
3668
3669         if ((terminator > 0 && *ptr++ != terminator) || *ptr++ != ')')
3670           {
3671           ptr--;      /* Error offset */
3672           *errorcodeptr = ERR26;
3673           goto FAILED;
3674           }
3675
3676         /* Do no further checking in the pre-compile phase. */
3677
3678         if (lengthptr != NULL) break;
3679
3680         /* In the real compile we do the work of looking for the actual
3681         reference. */
3682
3683         slot = cd->name_table;
3684         for (i = 0; i < cd->names_found; i++)
3685           {
3686           if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
3687           slot += cd->name_entry_size;
3688           }
3689
3690         /* Found a previous named subpattern */
3691
3692         if (i < cd->names_found)
3693           {
3694           recno = GET2(slot, 0);
3695           PUT2(code, 2+LINK_SIZE, recno);
3696           }
3697
3698         /* Search the pattern for a forward reference */
3699
3700         else if ((i = find_parens(ptr, cd->bracount, name, namelen,
3701                         (options & PCRE_EXTENDED) != 0)) > 0)
3702           {
3703           PUT2(code, 2+LINK_SIZE, i);
3704           }
3705
3706         /* If terminator == 0 it means that the name followed directly after
3707         the opening parenthesis [e.g. (?(abc)...] and in this case there are
3708         some further alternatives to try. For the cases where terminator != 0
3709         [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
3710         now checked all the possibilities, so give an error. */
3711
3712         else if (terminator != 0)
3713           {
3714           *errorcodeptr = ERR15;
3715           goto FAILED;
3716           }
3717
3718         /* Check for (?(R) for recursion. Allow digits after R to specify a
3719         specific group number. */
3720
3721         else if (*name == 'R')
3722           {
3723           recno = 0;
3724           for (i = 1; i < namelen; i++)
3725             {
3726             if ((digitab[name[i]] & ctype_digit) == 0)
3727               {
3728               *errorcodeptr = ERR15;
3729               goto FAILED;
3730               }
3731             recno = recno * 10 + name[i] - '0';
3732             }
3733           if (recno == 0) recno = RREF_ANY;
3734           code[1+LINK_SIZE] = OP_RREF;      /* Change test type */
3735           PUT2(code, 2+LINK_SIZE, recno);
3736           }
3737
3738         /* Similarly, check for the (?(DEFINE) "condition", which is always
3739         false. */
3740
3741         else if (namelen == 6 && strncmp((char *)name, "DEFINE", 6) == 0)
3742           {
3743           code[1+LINK_SIZE] = OP_DEF;
3744           skipbytes = 1;
3745           }
3746
3747         /* Check for the "name" actually being a subpattern number. */
3748
3749         else if (recno > 0)
3750           {
3751           PUT2(code, 2+LINK_SIZE, recno);
3752           }
3753
3754         /* Either an unidentified subpattern, or a reference to (?(0) */
3755
3756         else
3757           {
3758           *errorcodeptr = (recno == 0)? ERR35: ERR15;
3759           goto FAILED;
3760           }
3761         break;
3762
3763
3764         /* ------------------------------------------------------------ */
3765         case '=':                 /* Positive lookahead */
3766         bravalue = OP_ASSERT;
3767         ptr++;
3768         break;
3769
3770
3771         /* ------------------------------------------------------------ */
3772         case '!':                 /* Negative lookahead */
3773         bravalue = OP_ASSERT_NOT;
3774         ptr++;
3775         break;
3776
3777
3778         /* ------------------------------------------------------------ */
3779         case '<':                 /* Lookbehind or named define */
3780         switch (ptr[1])
3781           {
3782           case '=':               /* Positive lookbehind */
3783           bravalue = OP_ASSERTBACK;
3784           ptr += 2;
3785           break;
3786
3787           case '!':               /* Negative lookbehind */
3788           bravalue = OP_ASSERTBACK_NOT;
3789           ptr += 2;
3790           break;
3791
3792           default:                /* Could be name define, else bad */
3793           if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;
3794           ptr++;                  /* Correct offset for error */
3795           *errorcodeptr = ERR24;
3796           goto FAILED;
3797           }
3798         break;
3799
3800
3801         /* ------------------------------------------------------------ */
3802         case '>':                 /* One-time brackets */
3803         bravalue = OP_ONCE;
3804         ptr++;
3805         break;
3806
3807
3808         /* ------------------------------------------------------------ */
3809         case 'C':                 /* Callout - may be followed by digits; */
3810         previous_callout = code;  /* Save for later completion */
3811         after_manual_callout = 1; /* Skip one item before completing */
3812         *code++ = OP_CALLOUT;
3813           {
3814           int n = 0;
3815           while ((digitab[*(++ptr)] & ctype_digit) != 0)
3816             n = n * 10 + *ptr - '0';
3817           if (*ptr != ')')
3818             {
3819             *errorcodeptr = ERR39;
3820             goto FAILED;
3821             }
3822           if (n > 255)
3823             {
3824             *errorcodeptr = ERR38;
3825             goto FAILED;
3826             }
3827           *code++ = n;
3828           PUT(code, 0, ptr - cd->start_pattern + 1);  /* Pattern offset */
3829           PUT(code, LINK_SIZE, 0);                    /* Default length */
3830           code += 2 * LINK_SIZE;
3831           }
3832         previous = NULL;
3833         continue;
3834
3835
3836         /* ------------------------------------------------------------ */
3837         case 'P':                 /* Python-style named subpattern handling */
3838         if (*(++ptr) == '=' || *ptr == '>')  /* Reference or recursion */
3839           {
3840           is_recurse = *ptr == '>';
3841           terminator = ')';
3842           goto NAMED_REF_OR_RECURSE;
3843           }
3844         else if (*ptr != '<')    /* Test for Python-style definition */
3845           {
3846           *errorcodeptr = ERR41;
3847           goto FAILED;
3848           }
3849         /* Fall through to handle (?P< as (?< is handled */
3850
3851
3852         /* ------------------------------------------------------------ */
3853         DEFINE_NAME:    /* Come here from (?< handling */
3854         case '\'':
3855           {
3856           terminator = (*ptr == '<')? '>' : '\'';
3857           name = ++ptr;
3858
3859           while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
3860           namelen = ptr - name;
3861
3862           /* In the pre-compile phase, just do a syntax check. */
3863
3864           if (lengthptr != NULL)
3865             {
3866             if (*ptr != terminator)
3867               {
3868               *errorcodeptr = ERR42;
3869               goto FAILED;
3870               }
3871             if (cd->names_found >= MAX_NAME_COUNT)
3872               {
3873               *errorcodeptr = ERR49;
3874               goto FAILED;
3875               }
3876             if (namelen + 3 > cd->name_entry_size)
3877               {
3878               cd->name_entry_size = namelen + 3;
3879               if (namelen > MAX_NAME_SIZE)
3880                 {
3881                 *errorcodeptr = ERR48;
3882                 goto FAILED;
3883                 }
3884               }
3885             }
3886
3887           /* In the real compile, create the entry in the table */
3888
3889           else
3890             {
3891             slot = cd->name_table;
3892             for (i = 0; i < cd->names_found; i++)
3893               {
3894               int crc = memcmp(name, slot+2, namelen);
3895               if (crc == 0)
3896                 {
3897                 if (slot[2+namelen] == 0)
3898                   {
3899                   if ((options & PCRE_DUPNAMES) == 0)
3900                     {
3901                     *errorcodeptr = ERR43;
3902                     goto FAILED;
3903                     }
3904                   }
3905                 else crc = -1;      /* Current name is substring */
3906                 }
3907               if (crc < 0)
3908                 {
3909                 memmove(slot + cd->name_entry_size, slot,
3910                   (cd->names_found - i) * cd->name_entry_size);
3911                 break;
3912                 }
3913               slot += cd->name_entry_size;
3914               }
3915
3916             PUT2(slot, 0, cd->bracount + 1);
3917             memcpy(slot + 2, name, namelen);
3918             slot[2+namelen] = 0;
3919             }
3920           }
3921
3922         /* In both cases, count the number of names we've encountered. */
3923
3924         ptr++;                    /* Move past > or ' */
3925         cd->names_found++;
3926         goto NUMBERED_GROUP;
3927
3928
3929         /* ------------------------------------------------------------ */
3930         case '&':                 /* Perl recursion/subroutine syntax */
3931         terminator = ')';
3932         is_recurse = TRUE;
3933         /* Fall through */
3934
3935         /* We come here from the Python syntax above that handles both
3936         references (?P=name) and recursion (?P>name), as well as falling
3937         through from the Perl recursion syntax (?&name). */
3938
3939         NAMED_REF_OR_RECURSE:
3940         name = ++ptr;
3941         while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
3942         namelen = ptr - name;
3943
3944         /* In the pre-compile phase, do a syntax check and set a dummy
3945         reference number. */
3946
3947         if (lengthptr != NULL)
3948           {
3949           if (*ptr != terminator)
3950             {
3951             *errorcodeptr = ERR42;
3952             goto FAILED;
3953             }
3954           if (namelen > MAX_NAME_SIZE)
3955             {
3956             *errorcodeptr = ERR48;
3957             goto FAILED;
3958             }
3959           recno = 0;
3960           }
3961
3962         /* In the real compile, seek the name in the table */
3963
3964         else
3965           {
3966           slot = cd->name_table;
3967           for (i = 0; i < cd->names_found; i++)
3968             {
3969             if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
3970             slot += cd->name_entry_size;
3971             }
3972
3973           if (i < cd->names_found)         /* Back reference */
3974             {
3975             recno = GET2(slot, 0);
3976             }
3977           else if ((recno =                /* Forward back reference */
3978                     find_parens(ptr, cd->bracount, name, namelen,
3979                       (options & PCRE_EXTENDED) != 0)) <= 0)
3980             {
3981             *errorcodeptr = ERR15;
3982             goto FAILED;
3983             }
3984           }
3985
3986         /* In both phases, we can now go to the code that handles numerical
3987         recursion or backreferences. */
3988
3989         if (is_recurse) goto HANDLE_RECURSION;
3990           else goto HANDLE_REFERENCE;
3991
3992
3993         /* ------------------------------------------------------------ */
3994         case 'R':                 /* Recursion */
3995         ptr++;                    /* Same as (?0)      */
3996         /* Fall through */
3997
3998
3999         /* ------------------------------------------------------------ */
4000         case '0': case '1': case '2': case '3': case '4':   /* Recursion or */
4001         case '5': case '6': case '7': case '8': case '9':   /* subroutine */
4002           {
4003           const uschar *called;
4004           recno = 0;
4005           while((digitab[*ptr] & ctype_digit) != 0)
4006             recno = recno * 10 + *ptr++ - '0';
4007           if (*ptr != ')')
4008             {
4009             *errorcodeptr = ERR29;
4010             goto FAILED;
4011             }
4012
4013           /* Come here from code above that handles a named recursion */
4014
4015           HANDLE_RECURSION:
4016
4017           previous = code;
4018           called = cd->start_code;
4019
4020           /* When we are actually compiling, find the bracket that is being
4021           referenced. Temporarily end the regex in case it doesn't exist before
4022           this point. If we end up with a forward reference, first check that
4023           the bracket does occur later so we can give the error (and position)
4024           now. Then remember this forward reference in the workspace so it can
4025           be filled in at the end. */
4026
4027           if (lengthptr == NULL)
4028             {
4029             *code = OP_END;
4030             if (recno != 0) called = find_bracket(cd->start_code, utf8, recno);
4031
4032             /* Forward reference */
4033
4034             if (called == NULL)
4035               {
4036               if (find_parens(ptr, cd->bracount, NULL, recno,
4037                    (options & PCRE_EXTENDED) != 0) < 0)
4038                 {
4039                 *errorcodeptr = ERR15;
4040                 goto FAILED;
4041                 }
4042               called = cd->start_code + recno;
4043               PUTINC(cd->hwm, 0, code + 2 + LINK_SIZE - cd->start_code);
4044               }
4045
4046             /* If not a forward reference, and the subpattern is still open,
4047             this is a recursive call. We check to see if this is a left
4048             recursion that could loop for ever, and diagnose that case. */
4049
4050             else if (GET(called, 1) == 0 &&
4051                      could_be_empty(called, code, bcptr, utf8))
4052               {
4053               *errorcodeptr = ERR40;
4054               goto FAILED;
4055               }
4056             }
4057
4058           /* Insert the recursion/subroutine item, automatically wrapped inside
4059           "once" brackets. Set up a "previous group" length so that a
4060           subsequent quantifier will work. */
4061
4062           *code = OP_ONCE;
4063           PUT(code, 1, 2 + 2*LINK_SIZE);
4064           code += 1 + LINK_SIZE;
4065
4066           *code = OP_RECURSE;
4067           PUT(code, 1, called - cd->start_code);
4068           code += 1 + LINK_SIZE;
4069
4070           *code = OP_KET;
4071           PUT(code, 1, 2 + 2*LINK_SIZE);
4072           code += 1 + LINK_SIZE;
4073
4074           length_prevgroup = 3 + 3*LINK_SIZE;
4075           }
4076
4077         /* Can't determine a first byte now */
4078
4079         if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
4080         continue;
4081
4082
4083         /* ------------------------------------------------------------ */
4084         default:              /* Other characters: check option setting */
4085         set = unset = 0;
4086         optset = &set;
4087
4088         while (*ptr != ')' && *ptr != ':')
4089           {
4090           switch (*ptr++)
4091             {
4092             case '-': optset = &unset; break;
4093
4094             case 'J':    /* Record that it changed in the external options */
4095             *optset |= PCRE_DUPNAMES;
4096             cd->external_options |= PCRE_JCHANGED;
4097             break;
4098
4099             case 'i': *optset |= PCRE_CASELESS; break;
4100             case 'm': *optset |= PCRE_MULTILINE; break;
4101             case 's': *optset |= PCRE_DOTALL; break;
4102             case 'x': *optset |= PCRE_EXTENDED; break;
4103             case 'U': *optset |= PCRE_UNGREEDY; break;
4104             case 'X': *optset |= PCRE_EXTRA; break;
4105
4106             default:  *errorcodeptr = ERR12;
4107                       ptr--;    /* Correct the offset */
4108                       goto FAILED;
4109             }
4110           }
4111
4112         /* Set up the changed option bits, but don't change anything yet. */
4113
4114         newoptions = (options | set) & (~unset);
4115
4116         /* If the options ended with ')' this is not the start of a nested
4117         group with option changes, so the options change at this level. If this
4118         item is right at the start of the pattern, the options can be
4119         abstracted and made external in the pre-compile phase, and ignored in
4120         the compile phase. This can be helpful when matching -- for instance in
4121         caseless checking of required bytes.
4122
4123         If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are
4124         definitely *not* at the start of the pattern because something has been
4125         compiled. In the pre-compile phase, however, the code pointer can have
4126         that value after the start, because it gets reset as code is discarded
4127         during the pre-compile. However, this can happen only at top level - if
4128         we are within parentheses, the starting BRA will still be present. At
4129         any parenthesis level, the length value can be used to test if anything
4130         has been compiled at that level. Thus, a test for both these conditions
4131         is necessary to ensure we correctly detect the start of the pattern in
4132         both phases.
4133
4134         If we are not at the pattern start, compile code to change the ims
4135         options if this setting actually changes any of them. We also pass the
4136         new setting back so that it can be put at the start of any following
4137         branches, and when this group ends (if we are in a group), a resetting
4138         item can be compiled. */
4139
4140         if (*ptr == ')')
4141           {
4142           if (code == cd->start_code + 1 + LINK_SIZE &&
4143                (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE))
4144             {
4145             cd->external_options = newoptions;
4146             options = newoptions;
4147             }
4148          else
4149             {
4150             if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
4151               {
4152               *code++ = OP_OPT;
4153               *code++ = newoptions & PCRE_IMS;
4154               }
4155
4156             /* Change options at this level, and pass them back for use
4157             in subsequent branches. Reset the greedy defaults and the case
4158             value for firstbyte and reqbyte. */
4159
4160             *optionsptr = options = newoptions;
4161             greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
4162             greedy_non_default = greedy_default ^ 1;
4163             req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
4164             }
4165
4166           previous = NULL;       /* This item can't be repeated */
4167           continue;              /* It is complete */
4168           }
4169
4170         /* If the options ended with ':' we are heading into a nested group
4171         with possible change of options. Such groups are non-capturing and are
4172         not assertions of any kind. All we need to do is skip over the ':';
4173         the newoptions value is handled below. */
4174
4175         bravalue = OP_BRA;
4176         ptr++;
4177         }     /* End of switch for character following (? */
4178       }       /* End of (? handling */
4179
4180     /* Opening parenthesis not followed by '?'. If PCRE_NO_AUTO_CAPTURE is set,
4181     all unadorned brackets become non-capturing and behave like (?:...)
4182     brackets. */
4183
4184     else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
4185       {
4186       bravalue = OP_BRA;
4187       }
4188
4189     /* Else we have a capturing group. */
4190
4191     else
4192       {
4193       NUMBERED_GROUP:
4194       cd->bracount += 1;
4195       PUT2(code, 1+LINK_SIZE, cd->bracount);
4196       skipbytes = 2;
4197       }
4198
4199     /* Process nested bracketed regex. Assertions may not be repeated, but
4200     other kinds can be. All their opcodes are >= OP_ONCE. We copy code into a
4201     non-register variable in order to be able to pass its address because some
4202     compilers complain otherwise. Pass in a new setting for the ims options if
4203     they have changed. */
4204
4205     previous = (bravalue >= OP_ONCE)? code : NULL;
4206     *code = bravalue;
4207     tempcode = code;
4208     tempreqvary = cd->req_varyopt;     /* Save value before bracket */
4209     length_prevgroup = 0;              /* Initialize for pre-compile phase */
4210
4211     if (!compile_regex(
4212          newoptions,                   /* The complete new option state */
4213          options & PCRE_IMS,           /* The previous ims option state */
4214          &tempcode,                    /* Where to put code (updated) */
4215          &ptr,                         /* Input pointer (updated) */
4216          errorcodeptr,                 /* Where to put an error message */
4217          (bravalue == OP_ASSERTBACK ||
4218           bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
4219          skipbytes,                    /* Skip over bracket number */
4220          &subfirstbyte,                /* For possible first char */
4221          &subreqbyte,                  /* For possible last char */
4222          bcptr,                        /* Current branch chain */
4223          cd,                           /* Tables block */
4224          (lengthptr == NULL)? NULL :   /* Actual compile phase */
4225            &length_prevgroup           /* Pre-compile phase */
4226          ))
4227       goto FAILED;
4228
4229     /* At the end of compiling, code is still pointing to the start of the
4230     group, while tempcode has been updated to point past the end of the group
4231     and any option resetting that may follow it. The pattern pointer (ptr)
4232     is on the bracket. */
4233
4234     /* If this is a conditional bracket, check that there are no more than
4235     two branches in the group, or just one if it's a DEFINE group. */
4236
4237     if (bravalue == OP_COND)
4238       {
4239       uschar *tc = code;
4240       int condcount = 0;
4241
4242       do {
4243          condcount++;
4244          tc += GET(tc,1);
4245          }
4246       while (*tc != OP_KET);
4247
4248       /* A DEFINE group is never obeyed inline (the "condition" is always
4249       false). It must have only one branch. */
4250
4251       if (code[LINK_SIZE+1] == OP_DEF)
4252         {
4253         if (condcount > 1)
4254           {
4255           *errorcodeptr = ERR54;
4256           goto FAILED;
4257           }
4258         bravalue = OP_DEF;   /* Just a flag to suppress char handling below */
4259         }
4260
4261       /* A "normal" conditional group. If there is just one branch, we must not
4262       make use of its firstbyte or reqbyte, because this is equivalent to an
4263       empty second branch. */
4264
4265       else
4266         {
4267         if (condcount > 2)
4268           {
4269           *errorcodeptr = ERR27;
4270           goto FAILED;
4271           }
4272         if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
4273         }
4274       }
4275
4276     /* Error if hit end of pattern */
4277
4278     if (*ptr != ')')
4279       {
4280       *errorcodeptr = ERR14;
4281       goto FAILED;
4282       }
4283
4284     /* In the pre-compile phase, update the length by the length of the nested
4285     group, less the brackets at either end. Then reduce the compiled code to
4286     just the brackets so that it doesn't use much memory if it is duplicated by
4287     a quantifier. */
4288
4289     if (lengthptr != NULL)
4290       {
4291       *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
4292       code++;
4293       PUTINC(code, 0, 1 + LINK_SIZE);
4294       *code++ = OP_KET;
4295       PUTINC(code, 0, 1 + LINK_SIZE);
4296       }
4297
4298     /* Otherwise update the main code pointer to the end of the group. */
4299
4300     else code = tempcode;
4301
4302     /* For a DEFINE group, required and first character settings are not
4303     relevant. */
4304
4305     if (bravalue == OP_DEF) break;
4306
4307     /* Handle updating of the required and first characters for other types of
4308     group. Update for normal brackets of all kinds, and conditions with two
4309     branches (see code above). If the bracket is followed by a quantifier with
4310     zero repeat, we have to back off. Hence the definition of zeroreqbyte and
4311     zerofirstbyte outside the main loop so that they can be accessed for the
4312     back off. */
4313
4314     zeroreqbyte = reqbyte;
4315     zerofirstbyte = firstbyte;
4316     groupsetfirstbyte = FALSE;
4317
4318     if (bravalue >= OP_ONCE)
4319       {
4320       /* If we have not yet set a firstbyte in this branch, take it from the
4321       subpattern, remembering that it was set here so that a repeat of more
4322       than one can replicate it as reqbyte if necessary. If the subpattern has
4323       no firstbyte, set "none" for the whole branch. In both cases, a zero
4324       repeat forces firstbyte to "none". */
4325
4326       if (firstbyte == REQ_UNSET)
4327         {
4328         if (subfirstbyte >= 0)
4329           {
4330           firstbyte = subfirstbyte;
4331           groupsetfirstbyte = TRUE;
4332           }
4333         else firstbyte = REQ_NONE;
4334         zerofirstbyte = REQ_NONE;
4335         }
4336
4337       /* If firstbyte was previously set, convert the subpattern's firstbyte
4338       into reqbyte if there wasn't one, using the vary flag that was in
4339       existence beforehand. */
4340
4341       else if (subfirstbyte >= 0 && subreqbyte < 0)
4342         subreqbyte = subfirstbyte | tempreqvary;
4343
4344       /* If the subpattern set a required byte (or set a first byte that isn't
4345       really the first byte - see above), set it. */
4346
4347       if (subreqbyte >= 0) reqbyte = subreqbyte;
4348       }
4349
4350     /* For a forward assertion, we take the reqbyte, if set. This can be
4351     helpful if the pattern that follows the assertion doesn't set a different
4352     char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
4353     for an assertion, however because it leads to incorrect effect for patterns
4354     such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
4355     of a firstbyte. This is overcome by a scan at the end if there's no
4356     firstbyte, looking for an asserted first char. */
4357
4358     else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
4359     break;     /* End of processing '(' */
4360
4361
4362     /* ===================================================================*/
4363     /* Handle metasequences introduced by \. For ones like \d, the ESC_ values
4364     are arranged to be the negation of the corresponding OP_values. For the
4365     back references, the values are ESC_REF plus the reference number. Only
4366     back references and those types that consume a character may be repeated.
4367     We can test for values between ESC_b and ESC_Z for the latter; this may
4368     have to change if any new ones are ever created. */
4369
4370     case '\\':
4371     tempptr = ptr;
4372     c = check_escape(&ptr, errorcodeptr, cd->bracount, options, FALSE);
4373     if (*errorcodeptr != 0) goto FAILED;
4374
4375     if (c < 0)
4376       {
4377       if (-c == ESC_Q)            /* Handle start of quoted string */
4378         {
4379         if (ptr[1] == '\\' && ptr[2] == 'E') ptr += 2; /* avoid empty string */
4380           else inescq = TRUE;
4381         continue;
4382         }
4383
4384       if (-c == ESC_E) continue;  /* Perl ignores an orphan \E */
4385
4386       /* For metasequences that actually match a character, we disable the
4387       setting of a first character if it hasn't already been set. */
4388
4389       if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
4390         firstbyte = REQ_NONE;
4391
4392       /* Set values to reset to if this is followed by a zero repeat. */
4393
4394       zerofirstbyte = firstbyte;
4395       zeroreqbyte = reqbyte;
4396
4397       /* \k<name> or \k'name' is a back reference by name (Perl syntax) */
4398
4399       if (-c == ESC_k && (ptr[1] == '<' || ptr[1] == '\''))
4400         {
4401         is_recurse = FALSE;
4402         terminator = (*(++ptr) == '<')? '>' : '\'';
4403         goto NAMED_REF_OR_RECURSE;
4404         }
4405
4406       /* Back references are handled specially; must disable firstbyte if
4407       not set to cope with cases like (?=(\w+))\1: which would otherwise set
4408       ':' later. */
4409
4410       if (-c >= ESC_REF)
4411         {
4412         recno = -c - ESC_REF;
4413
4414         HANDLE_REFERENCE:    /* Come here from named backref handling */
4415         if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
4416         previous = code;
4417         *code++ = OP_REF;
4418         PUT2INC(code, 0, recno);
4419         cd->backref_map |= (recno < 32)? (1 << recno) : 1;
4420         if (recno > cd->top_backref) cd->top_backref = recno;
4421         }
4422
4423       /* So are Unicode property matches, if supported. */
4424
4425 #ifdef SUPPORT_UCP
4426       else if (-c == ESC_P || -c == ESC_p)
4427         {
4428         BOOL negated;
4429         int pdata;
4430         int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
4431         if (ptype < 0) goto FAILED;
4432         previous = code;
4433         *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
4434         *code++ = ptype;
4435         *code++ = pdata;
4436         }
4437 #else
4438
4439       /* If Unicode properties are not supported, \X, \P, and \p are not
4440       allowed. */
4441
4442       else if (-c == ESC_X || -c == ESC_P || -c == ESC_p)
4443         {
4444         *errorcodeptr = ERR45;
4445         goto FAILED;
4446         }
4447 #endif
4448
4449       /* For the rest (including \X when Unicode properties are supported), we
4450       can obtain the OP value by negating the escape value. */
4451
4452       else
4453         {
4454         previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
4455         *code++ = -c;
4456         }
4457       continue;
4458       }
4459
4460     /* We have a data character whose value is in c. In UTF-8 mode it may have
4461     a value > 127. We set its representation in the length/buffer, and then
4462     handle it as a data character. */
4463
4464 #ifdef SUPPORT_UTF8
4465     if (utf8 && c > 127)
4466       mclength = _pcre_ord2utf8(c, mcbuffer);
4467     else
4468 #endif
4469
4470      {
4471      mcbuffer[0] = c;
4472      mclength = 1;
4473      }
4474     goto ONE_CHAR;
4475
4476
4477     /* ===================================================================*/
4478     /* Handle a literal character. It is guaranteed not to be whitespace or #
4479     when the extended flag is set. If we are in UTF-8 mode, it may be a
4480     multi-byte literal character. */
4481
4482     default:
4483     NORMAL_CHAR:
4484     mclength = 1;
4485     mcbuffer[0] = c;
4486
4487 #ifdef SUPPORT_UTF8
4488     if (utf8 && c >= 0xc0)
4489       {
4490       while ((ptr[1] & 0xc0) == 0x80)
4491         mcbuffer[mclength++] = *(++ptr);
4492       }
4493 #endif
4494
4495     /* At this point we have the character's bytes in mcbuffer, and the length
4496     in mclength. When not in UTF-8 mode, the length is always 1. */
4497
4498     ONE_CHAR:
4499     previous = code;
4500     *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARNC : OP_CHAR;
4501     for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
4502
4503     /* Set the first and required bytes appropriately. If no previous first
4504     byte, set it from this character, but revert to none on a zero repeat.
4505     Otherwise, leave the firstbyte value alone, and don't change it on a zero
4506     repeat. */
4507
4508     if (firstbyte == REQ_UNSET)
4509       {
4510       zerofirstbyte = REQ_NONE;
4511       zeroreqbyte = reqbyte;
4512
4513       /* If the character is more than one byte long, we can set firstbyte
4514       only if it is not to be matched caselessly. */
4515
4516       if (mclength == 1 || req_caseopt == 0)
4517         {
4518         firstbyte = mcbuffer[0] | req_caseopt;
4519         if (mclength != 1) reqbyte = code[-1] | cd->req_varyopt;
4520         }
4521       else firstbyte = reqbyte = REQ_NONE;
4522       }
4523
4524     /* firstbyte was previously set; we can set reqbyte only if the length is
4525     1 or the matching is caseful. */
4526
4527     else
4528       {
4529       zerofirstbyte = firstbyte;
4530       zeroreqbyte = reqbyte;
4531       if (mclength == 1 || req_caseopt == 0)
4532         reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
4533       }
4534
4535     break;            /* End of literal character handling */
4536     }
4537   }                   /* end of big loop */
4538
4539
4540 /* Control never reaches here by falling through, only by a goto for all the
4541 error states. Pass back the position in the pattern so that it can be displayed
4542 to the user for diagnosing the error. */
4543
4544 FAILED:
4545 *ptrptr = ptr;
4546 return FALSE;
4547 }
4548
4549
4550
4551
4552 /*************************************************
4553 *     Compile sequence of alternatives           *
4554 *************************************************/
4555
4556 /* On entry, ptr is pointing past the bracket character, but on return it
4557 points to the closing bracket, or vertical bar, or end of string. The code
4558 variable is pointing at the byte into which the BRA operator has been stored.
4559 If the ims options are changed at the start (for a (?ims: group) or during any
4560 branch, we need to insert an OP_OPT item at the start of every following branch
4561 to ensure they get set correctly at run time, and also pass the new options
4562 into every subsequent branch compile.
4563
4564 This function is used during the pre-compile phase when we are trying to find
4565 out the amount of memory needed, as well as during the real compile phase. The
4566 value of lengthptr distinguishes the two phases.
4567
4568 Argument:
4569   options        option bits, including any changes for this subpattern
4570   oldims         previous settings of ims option bits
4571   codeptr        -> the address of the current code pointer
4572   ptrptr         -> the address of the current pattern pointer
4573   errorcodeptr   -> pointer to error code variable
4574   lookbehind     TRUE if this is a lookbehind assertion
4575   skipbytes      skip this many bytes at start (for brackets and OP_COND)
4576   firstbyteptr   place to put the first required character, or a negative number
4577   reqbyteptr     place to put the last required character, or a negative number
4578   bcptr          pointer to the chain of currently open branches
4579   cd             points to the data block with tables pointers etc.
4580   lengthptr      NULL during the real compile phase
4581                  points to length accumulator during pre-compile phase
4582
4583 Returns:         TRUE on success
4584 */
4585
4586 static BOOL
4587 compile_regex(int options, int oldims, uschar **codeptr, const uschar **ptrptr,
4588   int *errorcodeptr, BOOL lookbehind, int skipbytes, int *firstbyteptr,
4589   int *reqbyteptr, branch_chain *bcptr, compile_data *cd, int *lengthptr)
4590 {
4591 const uschar *ptr = *ptrptr;
4592 uschar *code = *codeptr;
4593 uschar *last_branch = code;
4594 uschar *start_bracket = code;
4595 uschar *reverse_count = NULL;
4596 int firstbyte, reqbyte;
4597 int branchfirstbyte, branchreqbyte;
4598 int length;
4599 branch_chain bc;
4600
4601 bc.outer = bcptr;
4602 bc.current = code;
4603
4604 firstbyte = reqbyte = REQ_UNSET;
4605
4606 /* Accumulate the length for use in the pre-compile phase. Start with the
4607 length of the BRA and KET and any extra bytes that are required at the
4608 beginning. We accumulate in a local variable to save frequent testing of
4609 lenthptr for NULL. We cannot do this by looking at the value of code at the
4610 start and end of each alternative, because compiled items are discarded during
4611 the pre-compile phase so that the work space is not exceeded. */
4612
4613 length = 2 + 2*LINK_SIZE + skipbytes;
4614
4615 /* WARNING: If the above line is changed for any reason, you must also change
4616 the code that abstracts option settings at the start of the pattern and makes
4617 them global. It tests the value of length for (2 + 2*LINK_SIZE) in the
4618 pre-compile phase to find out whether anything has yet been compiled or not. */
4619
4620 /* Offset is set zero to mark that this bracket is still open */
4621
4622 PUT(code, 1, 0);
4623 code += 1 + LINK_SIZE + skipbytes;
4624
4625 /* Loop for each alternative branch */
4626
4627 for (;;)
4628   {
4629   /* Handle a change of ims options at the start of the branch */
4630
4631   if ((options & PCRE_IMS) != oldims)
4632     {
4633     *code++ = OP_OPT;
4634     *code++ = options & PCRE_IMS;
4635     length += 2;
4636     }
4637
4638   /* Set up dummy OP_REVERSE if lookbehind assertion */
4639
4640   if (lookbehind)
4641     {
4642     *code++ = OP_REVERSE;
4643     reverse_count = code;
4644     PUTINC(code, 0, 0);
4645     length += 1 + LINK_SIZE;
4646     }
4647
4648   /* Now compile the branch; in the pre-compile phase its length gets added
4649   into the length. */
4650
4651   if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstbyte,
4652         &branchreqbyte, &bc, cd, (lengthptr == NULL)? NULL : &length))
4653     {
4654     *ptrptr = ptr;
4655     return FALSE;
4656     }
4657
4658   /* In the real compile phase, there is some post-processing to be done. */
4659
4660   if (lengthptr == NULL)
4661     {
4662     /* If this is the first branch, the firstbyte and reqbyte values for the
4663     branch become the values for the regex. */
4664
4665     if (*last_branch != OP_ALT)
4666       {
4667       firstbyte = branchfirstbyte;
4668       reqbyte = branchreqbyte;
4669       }
4670
4671     /* If this is not the first branch, the first char and reqbyte have to
4672     match the values from all the previous branches, except that if the
4673     previous value for reqbyte didn't have REQ_VARY set, it can still match,
4674     and we set REQ_VARY for the regex. */
4675
4676     else
4677       {
4678       /* If we previously had a firstbyte, but it doesn't match the new branch,
4679       we have to abandon the firstbyte for the regex, but if there was
4680       previously no reqbyte, it takes on the value of the old firstbyte. */
4681
4682       if (firstbyte >= 0 && firstbyte != branchfirstbyte)
4683         {
4684         if (reqbyte < 0) reqbyte = firstbyte;
4685         firstbyte = REQ_NONE;
4686         }
4687
4688       /* If we (now or from before) have no firstbyte, a firstbyte from the
4689       branch becomes a reqbyte if there isn't a branch reqbyte. */
4690
4691       if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
4692           branchreqbyte = branchfirstbyte;
4693
4694       /* Now ensure that the reqbytes match */
4695
4696       if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
4697         reqbyte = REQ_NONE;
4698       else reqbyte |= branchreqbyte;   /* To "or" REQ_VARY */
4699       }
4700
4701     /* If lookbehind, check that this branch matches a fixed-length string, and
4702     put the length into the OP_REVERSE item. Temporarily mark the end of the
4703     branch with OP_END. */
4704
4705     if (lookbehind)
4706       {
4707       int fixed_length;
4708       *code = OP_END;
4709       fixed_length = find_fixedlength(last_branch, options);
4710       DPRINTF(("fixed length = %d\n", fixed_length));
4711       if (fixed_length < 0)
4712         {
4713         *errorcodeptr = (fixed_length == -2)? ERR36 : ERR25;
4714         *ptrptr = ptr;
4715         return FALSE;
4716         }
4717       PUT(reverse_count, 0, fixed_length);
4718       }
4719     }
4720
4721   /* Reached end of expression, either ')' or end of pattern. Go back through
4722   the alternative branches and reverse the chain of offsets, with the field in
4723   the BRA item now becoming an offset to the first alternative. If there are
4724   no alternatives, it points to the end of the group. The length in the
4725   terminating ket is always the length of the whole bracketed item. If any of
4726   the ims options were changed inside the group, compile a resetting op-code
4727   following, except at the very end of the pattern. Return leaving the pointer
4728   at the terminating char. */
4729
4730   if (*ptr != '|')
4731     {
4732     int branch_length = code - last_branch;
4733     do
4734       {
4735       int prev_length = GET(last_branch, 1);
4736       PUT(last_branch, 1, branch_length);
4737       branch_length = prev_length;
4738       last_branch -= branch_length;
4739       }
4740     while (branch_length > 0);
4741
4742     /* Fill in the ket */
4743
4744     *code = OP_KET;
4745     PUT(code, 1, code - start_bracket);
4746     code += 1 + LINK_SIZE;
4747
4748     /* Resetting option if needed */
4749
4750     if ((options & PCRE_IMS) != oldims && *ptr == ')')
4751       {
4752       *code++ = OP_OPT;
4753       *code++ = oldims;
4754       length += 2;
4755       }
4756
4757     /* Set values to pass back */
4758
4759     *codeptr = code;
4760     *ptrptr = ptr;
4761     *firstbyteptr = firstbyte;
4762     *reqbyteptr = reqbyte;
4763     if (lengthptr != NULL) *lengthptr += length;
4764     return TRUE;
4765     }
4766
4767   /* Another branch follows; insert an "or" node. Its length field points back
4768   to the previous branch while the bracket remains open. At the end the chain
4769   is reversed. It's done like this so that the start of the bracket has a
4770   zero offset until it is closed, making it possible to detect recursion. */
4771
4772   *code = OP_ALT;
4773   PUT(code, 1, code - last_branch);
4774   bc.current = last_branch = code;
4775   code += 1 + LINK_SIZE;
4776   ptr++;
4777   length += 1 + LINK_SIZE;
4778   }
4779 /* Control never reaches here */
4780 }
4781
4782
4783
4784
4785 /*************************************************
4786 *          Check for anchored expression         *
4787 *************************************************/
4788
4789 /* Try to find out if this is an anchored regular expression. Consider each
4790 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
4791 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
4792 it's anchored. However, if this is a multiline pattern, then only OP_SOD
4793 counts, since OP_CIRC can match in the middle.
4794
4795 We can also consider a regex to be anchored if OP_SOM starts all its branches.
4796 This is the code for \G, which means "match at start of match position, taking
4797 into account the match offset".
4798
4799 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
4800 because that will try the rest of the pattern at all possible matching points,
4801 so there is no point trying again.... er ....
4802
4803 .... except when the .* appears inside capturing parentheses, and there is a
4804 subsequent back reference to those parentheses. We haven't enough information
4805 to catch that case precisely.
4806
4807 At first, the best we could do was to detect when .* was in capturing brackets
4808 and the highest back reference was greater than or equal to that level.
4809 However, by keeping a bitmap of the first 31 back references, we can catch some
4810 of the more common cases more precisely.
4811
4812 Arguments:
4813   code           points to start of expression (the bracket)
4814   options        points to the options setting
4815   bracket_map    a bitmap of which brackets we are inside while testing; this
4816                   handles up to substring 31; after that we just have to take
4817                   the less precise approach
4818   backref_map    the back reference bitmap
4819
4820 Returns:     TRUE or FALSE
4821 */
4822
4823 static BOOL
4824 is_anchored(register const uschar *code, int *options, unsigned int bracket_map,
4825   unsigned int backref_map)
4826 {
4827 do {
4828    const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
4829      options, PCRE_MULTILINE, FALSE);
4830    register int op = *scode;
4831
4832    /* Non-capturing brackets */
4833
4834    if (op == OP_BRA)
4835      {
4836      if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
4837      }
4838
4839    /* Capturing brackets */
4840
4841    else if (op == OP_CBRA)
4842      {
4843      int n = GET2(scode, 1+LINK_SIZE);
4844      int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
4845      if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
4846      }
4847
4848    /* Other brackets */
4849
4850    else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
4851      {
4852      if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
4853      }
4854
4855    /* .* is not anchored unless DOTALL is set and it isn't in brackets that
4856    are or may be referenced. */
4857
4858    else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
4859              op == OP_TYPEPOSSTAR) &&
4860             (*options & PCRE_DOTALL) != 0)
4861      {
4862      if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
4863      }
4864
4865    /* Check for explicit anchoring */
4866
4867    else if (op != OP_SOD && op != OP_SOM &&
4868            ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
4869      return FALSE;
4870    code += GET(code, 1);
4871    }
4872 while (*code == OP_ALT);   /* Loop for each alternative */
4873 return TRUE;
4874 }
4875
4876
4877
4878 /*************************************************
4879 *         Check for starting with ^ or .*        *
4880 *************************************************/
4881
4882 /* This is called to find out if every branch starts with ^ or .* so that
4883 "first char" processing can be done to speed things up in multiline
4884 matching and for non-DOTALL patterns that start with .* (which must start at
4885 the beginning or after \n). As in the case of is_anchored() (see above), we
4886 have to take account of back references to capturing brackets that contain .*
4887 because in that case we can't make the assumption.
4888
4889 Arguments:
4890   code           points to start of expression (the bracket)
4891   bracket_map    a bitmap of which brackets we are inside while testing; this
4892                   handles up to substring 31; after that we just have to take
4893                   the less precise approach
4894   backref_map    the back reference bitmap
4895
4896 Returns:         TRUE or FALSE
4897 */
4898
4899 static BOOL
4900 is_startline(const uschar *code, unsigned int bracket_map,
4901   unsigned int backref_map)
4902 {
4903 do {
4904    const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
4905      NULL, 0, FALSE);
4906    register int op = *scode;
4907
4908    /* Non-capturing brackets */
4909
4910    if (op == OP_BRA)
4911      {
4912      if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
4913      }
4914
4915    /* Capturing brackets */
4916
4917    else if (op == OP_CBRA)
4918      {
4919      int n = GET2(scode, 1+LINK_SIZE);
4920      int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
4921      if (!is_startline(scode, new_map, backref_map)) return FALSE;
4922      }
4923
4924    /* Other brackets */
4925
4926    else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
4927      { if (!is_startline(scode, bracket_map, backref_map)) return FALSE; }
4928
4929    /* .* means "start at start or after \n" if it isn't in brackets that
4930    may be referenced. */
4931
4932    else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
4933      {
4934      if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
4935      }
4936
4937    /* Check for explicit circumflex */
4938
4939    else if (op != OP_CIRC) return FALSE;
4940
4941    /* Move on to the next alternative */
4942
4943    code += GET(code, 1);
4944    }
4945 while (*code == OP_ALT);  /* Loop for each alternative */
4946 return TRUE;
4947 }
4948
4949
4950
4951 /*************************************************
4952 *       Check for asserted fixed first char      *
4953 *************************************************/
4954
4955 /* During compilation, the "first char" settings from forward assertions are
4956 discarded, because they can cause conflicts with actual literals that follow.
4957 However, if we end up without a first char setting for an unanchored pattern,
4958 it is worth scanning the regex to see if there is an initial asserted first
4959 char. If all branches start with the same asserted char, or with a bracket all
4960 of whose alternatives start with the same asserted char (recurse ad lib), then
4961 we return that char, otherwise -1.
4962
4963 Arguments:
4964   code       points to start of expression (the bracket)
4965   options    pointer to the options (used to check casing changes)
4966   inassert   TRUE if in an assertion
4967
4968 Returns:     -1 or the fixed first char
4969 */
4970
4971 static int
4972 find_firstassertedchar(const uschar *code, int *options, BOOL inassert)
4973 {
4974 register int c = -1;
4975 do {
4976    int d;
4977    const uschar *scode =
4978      first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);
4979    register int op = *scode;
4980
4981    switch(op)
4982      {
4983      default:
4984      return -1;
4985
4986      case OP_BRA:
4987      case OP_CBRA:
4988      case OP_ASSERT:
4989      case OP_ONCE:
4990      case OP_COND:
4991      if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)
4992        return -1;
4993      if (c < 0) c = d; else if (c != d) return -1;
4994      break;
4995
4996      case OP_EXACT:       /* Fall through */
4997      scode += 2;
4998
4999      case OP_CHAR:
5000      case OP_CHARNC:
5001      case OP_PLUS:
5002      case OP_MINPLUS:
5003      case OP_POSPLUS:
5004      if (!inassert) return -1;
5005      if (c < 0)
5006        {
5007        c = scode[1];
5008        if ((*options & PCRE_CASELESS) != 0) c |= REQ_CASELESS;
5009        }
5010      else if (c != scode[1]) return -1;
5011      break;
5012      }
5013
5014    code += GET(code, 1);
5015    }
5016 while (*code == OP_ALT);
5017 return c;
5018 }
5019
5020
5021
5022 /*************************************************
5023 *        Compile a Regular Expression            *
5024 *************************************************/
5025
5026 /* This function takes a string and returns a pointer to a block of store
5027 holding a compiled version of the expression. The original API for this
5028 function had no error code return variable; it is retained for backwards
5029 compatibility. The new function is given a new name.
5030
5031 Arguments:
5032   pattern       the regular expression
5033   options       various option bits
5034   errorcodeptr  pointer to error code variable (pcre_compile2() only)
5035                   can be NULL if you don't want a code value
5036   errorptr      pointer to pointer to error text
5037   erroroffset   ptr offset in pattern where error was detected
5038   tables        pointer to character tables or NULL
5039
5040 Returns:        pointer to compiled data block, or NULL on error,
5041                 with errorptr and erroroffset set
5042 */
5043
5044 PCRE_DATA_SCOPE pcre *
5045 pcre_compile(const char *pattern, int options, const char **errorptr,
5046   int *erroroffset, const unsigned char *tables)
5047 {
5048 return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
5049 }
5050
5051
5052 PCRE_DATA_SCOPE pcre *
5053 pcre_compile2(const char *pattern, int options, int *errorcodeptr,
5054   const char **errorptr, int *erroroffset, const unsigned char *tables)
5055 {
5056 real_pcre *re;
5057 int length = 1;  /* For final END opcode */
5058 int firstbyte, reqbyte, newline;
5059 int errorcode = 0;
5060 #ifdef SUPPORT_UTF8
5061 BOOL utf8;
5062 #endif
5063 size_t size;
5064 uschar *code;
5065 const uschar *codestart;
5066 const uschar *ptr;
5067 compile_data compile_block;
5068 compile_data *cd = &compile_block;
5069
5070 /* This space is used for "compiling" into during the first phase, when we are
5071 computing the amount of memory that is needed. Compiled items are thrown away
5072 as soon as possible, so that a fairly large buffer should be sufficient for
5073 this purpose. The same space is used in the second phase for remembering where
5074 to fill in forward references to subpatterns. */
5075
5076 uschar cworkspace[COMPILE_WORK_SIZE];
5077
5078
5079 /* Set this early so that early errors get offset 0. */
5080
5081 ptr = (const uschar *)pattern;
5082
5083 /* We can't pass back an error message if errorptr is NULL; I guess the best we
5084 can do is just return NULL, but we can set a code value if there is a code
5085 pointer. */
5086
5087 if (errorptr == NULL)
5088   {
5089   if (errorcodeptr != NULL) *errorcodeptr = 99;
5090   return NULL;
5091   }
5092
5093 *errorptr = NULL;
5094 if (errorcodeptr != NULL) *errorcodeptr = ERR0;
5095
5096 /* However, we can give a message for this error */
5097
5098 if (erroroffset == NULL)
5099   {
5100   errorcode = ERR16;
5101   goto PCRE_EARLY_ERROR_RETURN;
5102   }
5103
5104 *erroroffset = 0;
5105
5106 /* Can't support UTF8 unless PCRE has been compiled to include the code. */
5107
5108 #ifdef SUPPORT_UTF8
5109 utf8 = (options & PCRE_UTF8) != 0;
5110 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
5111      (*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0)
5112   {
5113   errorcode = ERR44;
5114   goto PCRE_UTF8_ERROR_RETURN;
5115   }
5116 #else
5117 if ((options & PCRE_UTF8) != 0)
5118   {
5119   errorcode = ERR32;
5120   goto PCRE_EARLY_ERROR_RETURN;
5121   }
5122 #endif
5123
5124 if ((options & ~PUBLIC_OPTIONS) != 0)
5125   {
5126   errorcode = ERR17;
5127   goto PCRE_EARLY_ERROR_RETURN;
5128   }
5129
5130 /* Set up pointers to the individual character tables */
5131
5132 if (tables == NULL) tables = _pcre_default_tables;
5133 cd->lcc = tables + lcc_offset;
5134 cd->fcc = tables + fcc_offset;
5135 cd->cbits = tables + cbits_offset;
5136 cd->ctypes = tables + ctypes_offset;
5137
5138 /* Handle different types of newline. The three bits give seven cases. The
5139 current code allows for fixed one- or two-byte sequences, plus "any". */
5140
5141 switch (options & (PCRE_NEWLINE_CRLF | PCRE_NEWLINE_ANY))
5142   {
5143   case 0: newline = NEWLINE; break;   /* Compile-time default */
5144   case PCRE_NEWLINE_CR: newline = '\r'; break;
5145   case PCRE_NEWLINE_LF: newline = '\n'; break;
5146   case PCRE_NEWLINE_CR+
5147        PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
5148   case PCRE_NEWLINE_ANY: newline = -1; break;
5149   default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
5150   }
5151
5152 if (newline < 0)
5153   {
5154   cd->nltype = NLTYPE_ANY;
5155   }
5156 else
5157   {
5158   cd->nltype = NLTYPE_FIXED;
5159   if (newline > 255)
5160     {
5161     cd->nllen = 2;
5162     cd->nl[0] = (newline >> 8) & 255;
5163     cd->nl[1] = newline & 255;
5164     }
5165   else
5166     {
5167     cd->nllen = 1;
5168     cd->nl[0] = newline;
5169     }
5170   }
5171
5172 /* Maximum back reference and backref bitmap. The bitmap records up to 31 back
5173 references to help in deciding whether (.*) can be treated as anchored or not.
5174 */
5175
5176 cd->top_backref = 0;
5177 cd->backref_map = 0;
5178
5179 /* Reflect pattern for debugging output */
5180
5181 DPRINTF(("------------------------------------------------------------------\n"));
5182 DPRINTF(("%s\n", pattern));
5183
5184 /* Pretend to compile the pattern while actually just accumulating the length
5185 of memory required. This behaviour is triggered by passing a non-NULL final
5186 argument to compile_regex(). We pass a block of workspace (cworkspace) for it
5187 to compile parts of the pattern into; the compiled code is discarded when it is
5188 no longer needed, so hopefully this workspace will never overflow, though there
5189 is a test for its doing so. */
5190
5191 cd->bracount = 0;
5192 cd->names_found = 0;
5193 cd->name_entry_size = 0;
5194 cd->name_table = NULL;
5195 cd->start_workspace = cworkspace;
5196 cd->start_code = cworkspace;
5197 cd->hwm = cworkspace;
5198 cd->start_pattern = (const uschar *)pattern;
5199 cd->end_pattern = (const uschar *)(pattern + strlen(pattern));
5200 cd->req_varyopt = 0;
5201 cd->nopartial = FALSE;
5202 cd->external_options = options;
5203
5204 /* Now do the pre-compile. On error, errorcode will be set non-zero, so we
5205 don't need to look at the result of the function here. The initial options have
5206 been put into the cd block so that they can be changed if an option setting is
5207 found within the regex right at the beginning. Bringing initial option settings
5208 outside can help speed up starting point checks. */
5209
5210 code = cworkspace;
5211 *code = OP_BRA;
5212 (void)compile_regex(cd->external_options, cd->external_options & PCRE_IMS,
5213   &code, &ptr, &errorcode, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, &length);
5214 if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN;
5215
5216 DPRINTF(("end pre-compile: length=%d workspace=%d\n", length,
5217   cd->hwm - cworkspace));
5218
5219 if (length > MAX_PATTERN_SIZE)
5220   {
5221   errorcode = ERR20;
5222   goto PCRE_EARLY_ERROR_RETURN;
5223   }
5224
5225 /* Compute the size of data block needed and get it, either from malloc or
5226 externally provided function. Integer overflow should no longer be possible
5227 because nowadays we limit the maximum value of cd->names_found and
5228 cd->name_entry_size. */
5229
5230 size = length + sizeof(real_pcre) + cd->names_found * (cd->name_entry_size + 3);
5231 re = (real_pcre *)(pcre_malloc)(size);
5232
5233 if (re == NULL)
5234   {
5235   errorcode = ERR21;
5236   goto PCRE_EARLY_ERROR_RETURN;
5237   }
5238
5239 /* Put in the magic number, and save the sizes, initial options, and character
5240 table pointer. NULL is used for the default character tables. The nullpad field
5241 is at the end; it's there to help in the case when a regex compiled on a system
5242 with 4-byte pointers is run on another with 8-byte pointers. */
5243
5244 re->magic_number = MAGIC_NUMBER;
5245 re->size = size;
5246 re->options = cd->external_options;
5247 re->dummy1 = 0;
5248 re->first_byte = 0;
5249 re->req_byte = 0;
5250 re->name_table_offset = sizeof(real_pcre);
5251 re->name_entry_size = cd->name_entry_size;
5252 re->name_count = cd->names_found;
5253 re->ref_count = 0;
5254 re->tables = (tables == _pcre_default_tables)? NULL : tables;
5255 re->nullpad = NULL;
5256
5257 /* The starting points of the name/number translation table and of the code are
5258 passed around in the compile data block. The start/end pattern and initial
5259 options are already set from the pre-compile phase, as is the name_entry_size
5260 field. Reset the bracket count and the names_found field. Also reset the hwm
5261 field; this time it's used for remembering forward references to subpatterns.
5262 */
5263
5264 cd->bracount = 0;
5265 cd->names_found = 0;
5266 cd->name_table = (uschar *)re + re->name_table_offset;
5267 codestart = cd->name_table + re->name_entry_size * re->name_count;
5268 cd->start_code = codestart;
5269 cd->hwm = cworkspace;
5270 cd->req_varyopt = 0;
5271 cd->nopartial = FALSE;
5272
5273 /* Set up a starting, non-extracting bracket, then compile the expression. On
5274 error, errorcode will be set non-zero, so we don't need to look at the result
5275 of the function here. */
5276
5277 ptr = (const uschar *)pattern;
5278 code = (uschar *)codestart;
5279 *code = OP_BRA;
5280 (void)compile_regex(re->options, re->options & PCRE_IMS, &code, &ptr,
5281   &errorcode, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, NULL);
5282 re->top_bracket = cd->bracount;
5283 re->top_backref = cd->top_backref;
5284
5285 if (cd->nopartial) re->options |= PCRE_NOPARTIAL;
5286
5287 /* If not reached end of pattern on success, there's an excess bracket. */
5288
5289 if (errorcode == 0 && *ptr != 0) errorcode = ERR22;
5290
5291 /* Fill in the terminating state and check for disastrous overflow, but
5292 if debugging, leave the test till after things are printed out. */
5293
5294 *code++ = OP_END;
5295
5296 #ifndef DEBUG
5297 if (code - codestart > length) errorcode = ERR23;
5298 #endif
5299
5300 /* Fill in any forward references that are required. */
5301
5302 while (errorcode == 0 && cd->hwm > cworkspace)
5303   {
5304   int offset, recno;
5305   const uschar *groupptr;
5306   cd->hwm -= LINK_SIZE;
5307   offset = GET(cd->hwm, 0);
5308   recno = GET(codestart, offset);
5309   groupptr = find_bracket(codestart, (re->options & PCRE_UTF8) != 0, recno);
5310   if (groupptr == NULL) errorcode = ERR53;
5311     else PUT(((uschar *)codestart), offset, groupptr - codestart);
5312   }
5313
5314 /* Give an error if there's a back reference to a non-existent capturing
5315 subpattern. */
5316
5317 if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15;
5318
5319 /* Failed to compile, or error while post-processing */
5320
5321 if (errorcode != 0)
5322   {
5323   (pcre_free)(re);
5324   PCRE_EARLY_ERROR_RETURN:
5325   *erroroffset = ptr - (const uschar *)pattern;
5326 #ifdef SUPPORT_UTF8
5327   PCRE_UTF8_ERROR_RETURN:
5328 #endif
5329   *errorptr = error_texts[errorcode];
5330   if (errorcodeptr != NULL) *errorcodeptr = errorcode;
5331   return NULL;
5332   }
5333
5334 /* If the anchored option was not passed, set the flag if we can determine that
5335 the pattern is anchored by virtue of ^ characters or \A or anything else (such
5336 as starting with .* when DOTALL is set).
5337
5338 Otherwise, if we know what the first byte has to be, save it, because that
5339 speeds up unanchored matches no end. If not, see if we can set the
5340 PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
5341 start with ^, and also when all branches start with .* for non-DOTALL matches.
5342 */
5343
5344 if ((re->options & PCRE_ANCHORED) == 0)
5345   {
5346   int temp_options = re->options;   /* May get changed during these scans */
5347   if (is_anchored(codestart, &temp_options, 0, cd->backref_map))
5348     re->options |= PCRE_ANCHORED;
5349   else
5350     {
5351     if (firstbyte < 0)
5352       firstbyte = find_firstassertedchar(codestart, &temp_options, FALSE);
5353     if (firstbyte >= 0)   /* Remove caseless flag for non-caseable chars */
5354       {
5355       int ch = firstbyte & 255;
5356       re->first_byte = ((firstbyte & REQ_CASELESS) != 0 &&
5357          cd->fcc[ch] == ch)? ch : firstbyte;
5358       re->options |= PCRE_FIRSTSET;
5359       }
5360     else if (is_startline(codestart, 0, cd->backref_map))
5361       re->options |= PCRE_STARTLINE;
5362     }
5363   }
5364
5365 /* For an anchored pattern, we use the "required byte" only if it follows a
5366 variable length item in the regex. Remove the caseless flag for non-caseable
5367 bytes. */
5368
5369 if (reqbyte >= 0 &&
5370      ((re->options & PCRE_ANCHORED) == 0 || (reqbyte & REQ_VARY) != 0))
5371   {
5372   int ch = reqbyte & 255;
5373   re->req_byte = ((reqbyte & REQ_CASELESS) != 0 &&
5374     cd->fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte;
5375   re->options |= PCRE_REQCHSET;
5376   }
5377
5378 /* Print out the compiled data if debugging is enabled. This is never the
5379 case when building a production library. */
5380
5381 #ifdef DEBUG
5382
5383 printf("Length = %d top_bracket = %d top_backref = %d\n",
5384   length, re->top_bracket, re->top_backref);
5385
5386 if (re->options != 0)
5387   {
5388   printf("%s%s%s%s%s%s%s%s%s\n",
5389     ((re->options & PCRE_NOPARTIAL) != 0)? "nopartial " : "",
5390     ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
5391     ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
5392     ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
5393     ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
5394     ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
5395     ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
5396     ((re->options & PCRE_EXTRA) != 0)? "extra " : "",
5397     ((re->options & PCRE_UNGREEDY) != 0)? "ungreedy " : "");
5398   }
5399
5400 if ((re->options & PCRE_FIRSTSET) != 0)
5401   {
5402   int ch = re->first_byte & 255;
5403   const char *caseless = ((re->first_byte & REQ_CASELESS) == 0)?
5404     "" : " (caseless)";
5405   if (isprint(ch)) printf("First char = %c%s\n", ch, caseless);
5406     else printf("First char = \\x%02x%s\n", ch, caseless);
5407   }
5408
5409 if ((re->options & PCRE_REQCHSET) != 0)
5410   {
5411   int ch = re->req_byte & 255;
5412   const char *caseless = ((re->req_byte & REQ_CASELESS) == 0)?
5413     "" : " (caseless)";
5414   if (isprint(ch)) printf("Req char = %c%s\n", ch, caseless);
5415     else printf("Req char = \\x%02x%s\n", ch, caseless);
5416   }
5417
5418 pcre_printint(re, stdout);
5419
5420 /* This check is done here in the debugging case so that the code that
5421 was compiled can be seen. */
5422
5423 if (code - codestart > length)
5424   {
5425   (pcre_free)(re);
5426   *errorptr = error_texts[ERR23];
5427   *erroroffset = ptr - (uschar *)pattern;
5428   if (errorcodeptr != NULL) *errorcodeptr = ERR23;
5429   return NULL;
5430   }
5431 #endif   /* DEBUG */
5432
5433 return (pcre *)re;
5434 }
5435
5436 /* End of pcre_compile.c */