src/src/pcre/pcre_compile.c

   1 /* $Cambridge: exim/src/src/pcre/pcre_compile.c,v 1.6 2007/11/12 13:02:19 nm4 Exp $ */
   2
   3 /*************************************************
   4 *      Perl-Compatible Regular Expressions       *
   5 *************************************************/
   6
   7 /* PCRE is a library of functions to support regular expressions whose syntax
   8 and semantics are as close as possible to those of the Perl 5 language.
   9
  10                        Written by Philip Hazel
  11            Copyright (c) 1997-2007 University of Cambridge
  12
  13 -----------------------------------------------------------------------------
  14 Redistribution and use in source and binary forms, with or without
  15 modification, are permitted provided that the following conditions are met:
  16
  17     * Redistributions of source code must retain the above copyright notice,
  18       this list of conditions and the following disclaimer.
  19
  20     * Redistributions in binary form must reproduce the above copyright
  21       notice, this list of conditions and the following disclaimer in the
  22       documentation and/or other materials provided with the distribution.
  23
  24     * Neither the name of the University of Cambridge nor the names of its
  25       contributors may be used to endorse or promote products derived from
  26       this software without specific prior written permission.
  27
  28 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  29 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  30 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  31 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  32 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  33 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  34 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  35 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  36 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  37 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  38 POSSIBILITY OF SUCH DAMAGE.
  39 -----------------------------------------------------------------------------
  40 */
  41
  42
  43 /* This module contains the external function pcre_compile(), along with
  44 supporting internal functions that are not used by other modules. */
  45
  46
  47 #ifdef HAVE_CONFIG_H
  48 #include "config.h"
  49 #endif
  50
  51 #define NLBLOCK cd             /* Block containing newline information */
  52 #define PSSTART start_pattern  /* Field containing processed string start */
  53 #define PSEND   end_pattern    /* Field containing processed string end */
  54
  55 #include "pcre_internal.h"
  56
  57
  58 /* When DEBUG is defined, we need the pcre_printint() function, which is also
  59 used by pcretest. DEBUG is not defined when building a production library. */
  60
  61 #ifdef DEBUG
  62 #include "pcre_printint.src"
  63 #endif
  64
  65
  66 /* Macro for setting individual bits in class bitmaps. */
  67
  68 #define SETBIT(a,b) a[b/8] |= (1 << (b%8))
  69
  70 /* Maximum length value to check against when making sure that the integer that
  71 holds the compiled pattern length does not overflow. We make it a bit less than
  72 INT_MAX to allow for adding in group terminating bytes, so that we don't have
  73 to check them every time. */
  74
  75 #define OFLOW_MAX (INT_MAX - 20)
  76
  77
  78 /*************************************************
  79 *      Code parameters and static tables         *
  80 *************************************************/
  81
  82 /* This value specifies the size of stack workspace that is used during the
  83 first pre-compile phase that determines how much memory is required. The regex
  84 is partly compiled into this space, but the compiled parts are discarded as
  85 soon as they can be, so that hopefully there will never be an overrun. The code
  86 does, however, check for an overrun. The largest amount I've seen used is 218,
  87 so this number is very generous.
  88
  89 The same workspace is used during the second, actual compile phase for
  90 remembering forward references to groups so that they can be filled in at the
  91 end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
  92 is 4 there is plenty of room. */
  93
  94 #define COMPILE_WORK_SIZE (4096)
  95
  96
  97 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
  98 are simple data values; negative values are for special things like \d and so
  99 on. Zero means further processing is needed (for things like \x), or the escape
 100 is invalid. */
 101
  102 #ifndef EBCDIC  /* This is the "normal" table for ASCII systems */
/* check_escape() indexes this table with (character - '0'), and only for
characters in the range '0' to 'z', so the first entry belongs to '0' and the
last to 'z'. */
  103 static const short int escapes[] = {
  104      0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */
  105      0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */
  106    '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E,      0, -ESC_G,   /* @ - G */
  107 -ESC_H,      0,      0, -ESC_K,      0,      0,      0,      0,   /* H - O */
  108 -ESC_P, -ESC_Q, -ESC_R, -ESC_S,      0,      0, -ESC_V, -ESC_W,   /* P - W */
  109 -ESC_X,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */
  110    '`',      7, -ESC_b,      0, -ESC_d,  ESC_e,  ESC_f,      0,   /* ` - g */
  111 -ESC_h,      0,      0, -ESC_k,      0,      0,  ESC_n,      0,   /* h - o */
  112 -ESC_p,      0,  ESC_r, -ESC_s,  ESC_tee,    0, -ESC_v, -ESC_w,   /* p - w */
  113      0,      0, -ESC_z                                            /* x - z */
  114 };
  115
  116 #else           /* This is the "abnormal" table for EBCDIC systems */
/* check_escape() indexes this table with (character - 0x48); the hexadecimal
value in the comment at the start of each row is the character code of the
first entry in that row. */
  117 static const short int escapes[] = {
  118 /*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',
  119 /*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,
  120 /*  58 */     0,     0,    '!',     '$',    '*',   ')',    ';',    '~',
  121 /*  60 */   '-',   '/',      0,       0,      0,     0,      0,      0,
  122 /*  68 */     0,     0,    '|',     ',',    '%',   '_',    '>',    '?',
  123 /*  70 */     0,     0,      0,       0,      0,     0,      0,      0,
  124 /*  78 */     0,   '`',    ':',     '#',    '@',  '\'',    '=',    '"',
  125 /*  80 */     0,     7, -ESC_b,       0, -ESC_d, ESC_e,  ESC_f,      0,
  126 /*  88 */-ESC_h,     0,      0,     '{',      0,     0,      0,      0,
  127 /*  90 */     0,     0, -ESC_k,     'l',      0, ESC_n,      0, -ESC_p,
  128 /*  98 */     0, ESC_r,      0,     '}',      0,     0,      0,      0,
  129 /*  A0 */     0,   '~', -ESC_s, ESC_tee,      0,-ESC_v, -ESC_w,      0,
  130 /*  A8 */     0,-ESC_z,      0,       0,      0,   '[',      0,      0,
  131 /*  B0 */     0,     0,      0,       0,      0,     0,      0,      0,
  132 /*  B8 */     0,     0,      0,       0,      0,   ']',    '=',    '-',
  133 /*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,
  134 /*  C8 */-ESC_H,     0,      0,       0,      0,     0,      0,      0,
  135 /*  D0 */   '}',     0, -ESC_K,       0,      0,     0,      0, -ESC_P,
  136 /*  D8 */-ESC_Q,-ESC_R,      0,       0,      0,     0,      0,      0,
  137 /*  E0 */  '\\',     0, -ESC_S,       0,      0,-ESC_V, -ESC_W, -ESC_X,
  138 /*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,
  139 /*  F0 */     0,     0,      0,       0,      0,     0,      0,      0,
  140 /*  F8 */     0,     0,      0,       0,      0,     0,      0,      0
  141 };
  142 #endif
 143
 144
 145 /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
 146 searched linearly. Put all the names into a single string, in order to reduce
 147 the number of relocations when a shared library is dynamically linked. */
 148
 149 typedef struct verbitem {
 150   int   len;
 151   int   op;
 152 } verbitem;
 153
 154 static const char verbnames[] =
 155   "ACCEPT\0"
 156   "COMMIT\0"
 157   "F\0"
 158   "FAIL\0"
 159   "PRUNE\0"
 160   "SKIP\0"
 161   "THEN";
 162
 163 static verbitem verbs[] = {
 164   { 6, OP_ACCEPT },
 165   { 6, OP_COMMIT },
 166   { 1, OP_FAIL },
 167   { 4, OP_FAIL },
 168   { 5, OP_PRUNE },
 169   { 4, OP_SKIP  },
 170   { 4, OP_THEN  }
 171 };
 172
 173 static int verbcount = sizeof(verbs)/sizeof(verbitem);
 174
 175
 176 /* Tables of names of POSIX character classes and their lengths. The names are
 177 now all in a single string, to reduce the number of relocations when a shared
 178 library is dynamically loaded. The list of lengths is terminated by a zero
 179 length entry. The first three must be alpha, lower, upper, as this is assumed
 180 for handling case independence. */
 181
/* The class names, concatenated with a zero byte between each pair. */
  182 static const char posix_names[] =
  183   "alpha\0"  "lower\0"  "upper\0"  "alnum\0"  "ascii\0"  "blank\0"
  184   "cntrl\0"  "digit\0"  "graph\0"  "print\0"  "punct\0"  "space\0"
  185   "word\0"   "xdigit";
  186
/* One length for each name in posix_names, in the same order, followed by the
zero entry that terminates the list. */
  187 static const uschar posix_name_lengths[] = {
  188   5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
  189
  190 /* Table of class bit maps for each POSIX class. Each class is formed from a
  191 base map, with an optional addition or removal of another map. Then, for some
  192 classes, there is some additional tweaking: for [:blank:] the vertical space
  193 characters are removed, and for [:alpha:] and [:alnum:] the underscore
  194 character is removed. The triples in the table consist of the base map offset,
  195 second map offset or -1 if no second map, and a non-negative value for map
  196 addition or a negative value for map subtraction (if there are two maps). The
  197 absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
  198 remove vertical space characters, 2 => remove underscore. */
  199
/* There is one triple per class, in the same order as the names in
posix_names. */
  200 static const int posix_class_maps[] = {
  201   cbit_word,  cbit_digit, -2,             /* alpha - word minus digit, without underscore */
  202   cbit_lower, -1,          0,             /* lower */
  203   cbit_upper, -1,          0,             /* upper */
  204   cbit_word,  -1,          2,             /* alnum - word without underscore */
  205   cbit_print, cbit_cntrl,  0,             /* ascii - print plus cntrl */
  206   cbit_space, -1,          1,             /* blank - a GNU extension */
  207   cbit_cntrl, -1,          0,             /* cntrl */
  208   cbit_digit, -1,          0,             /* digit */
  209   cbit_graph, -1,          0,             /* graph */
  210   cbit_print, -1,          0,             /* print */
  211   cbit_punct, -1,          0,             /* punct */
  212   cbit_space, -1,          0,             /* space */
  213   cbit_word,  -1,          0,             /* word - a Perl extension */
  214   cbit_xdigit,-1,          0              /* xdigit */
  215 };
 216
 217
/* Two-level stringification: XSTRING(s) expands its argument first (for
example a macro such as MAX_NAME_SIZE) and then STRING turns the result into a
string literal. Using STRING directly would stringify the macro's name. */
  218 #define STRING(a)  # a
  219 #define XSTRING(s) STRING(s)
 220
 221 /* The texts of compile-time error messages. These are "char *" because they
 222 are passed to the outside world. Do not ever re-use any error number, because
 223 they are documented. Always add a new error instead. Messages marked DEAD below
 224 are no longer used. This used to be a table of strings, but in order to reduce
 225 the number of relocations needed when a shared library is loaded dynamically,
 226 it is now one long string. We cannot use a table of offsets, because the
 227 lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
 228 simply count through to the one we want - this isn't a performance issue
 229 because these strings are used only when there is a compilation error. */
 230
/* Message number n is located by find_error_text(n), which skips n zero
terminators from the start of this string. The numbered comments give the
error number of the message that follows them. */
  231 static const char error_texts[] =
  232   "no error\0"
  233   "\\ at end of pattern\0"
  234   "\\c at end of pattern\0"
  235   "unrecognized character follows \\\0"
  236   "numbers out of order in {} quantifier\0"
  237   /* 5 */
  238   "number too big in {} quantifier\0"
  239   "missing terminating ] for character class\0"
  240   "invalid escape sequence in character class\0"
  241   "range out of order in character class\0"
  242   "nothing to repeat\0"
  243   /* 10 */
  244   "operand of unlimited repeat could match the empty string\0"  /** DEAD **/
  245   "internal error: unexpected repeat\0"
  246   "unrecognized character after (?\0"
  247   "POSIX named classes are supported only within a class\0"
  248   "missing )\0"
  249   /* 15 */
  250   "reference to non-existent subpattern\0"
  251   "erroffset passed as NULL\0"
  252   "unknown option bit(s) set\0"
  253   "missing ) after comment\0"
  254   "parentheses nested too deeply\0"  /** DEAD **/
  255   /* 20 */
  256   "regular expression is too large\0"
  257   "failed to get memory\0"
  258   "unmatched parentheses\0"
  259   "internal error: code overflow\0"
  260   "unrecognized character after (?<\0"
  261   /* 25 */
  262   "lookbehind assertion is not fixed length\0"
  263   "malformed number or name after (?(\0"
  264   "conditional group contains more than two branches\0"
  265   "assertion expected after (?(\0"
  266   "(?R or (?[+-]digits must be followed by )\0"
  267   /* 30 */
  268   "unknown POSIX class name\0"
  269   "POSIX collating elements are not supported\0"
  270   "this version of PCRE is not compiled with PCRE_UTF8 support\0"
  271   "spare error\0"  /** DEAD **/
  272   "character value in \\x{...} sequence is too large\0"
  273   /* 35 */
  274   "invalid condition (?(0)\0"
  275   "\\C not allowed in lookbehind assertion\0"
  276   "PCRE does not support \\L, \\l, \\N, \\U, or \\u\0"
  277   "number after (?C is > 255\0"
  278   "closing ) for (?C expected\0"
  279   /* 40 */
  280   "recursive call could loop indefinitely\0"
  281   "unrecognized character after (?P\0"
  282   "syntax error in subpattern name (missing terminator)\0"
  283   "two named subpatterns have the same name\0"
  284   "invalid UTF-8 string\0"
  285   /* 45 */
  286   "support for \\P, \\p, and \\X has not been compiled\0"
  287   "malformed \\P or \\p sequence\0"
  288   "unknown property name after \\P or \\p\0"
  289   "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
  290   "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
  291   /* 50 */
  292   "repeated subpattern is too long\0"    /** DEAD **/
  293   "octal value is greater than \\377 (not in UTF-8 mode)\0"
  294   "internal error: overran compiling workspace\0"
  295   "internal error: previously-checked referenced subpattern not found\0"
  296   "DEFINE group contains more than one branch\0"
  297   /* 55 */
  298   "repeating a DEFINE group is not allowed\0"
  299   "inconsistent NEWLINE options\0"
  300   "\\g is not followed by a braced name or an optionally braced non-zero number\0"
  301   "(?+ or (?- or (?(+ or (?(- must be followed by a non-zero number\0"
  302   "(*VERB) with an argument is not supported\0"
  303   /* 60 */
  304   "(*VERB) not recognized\0"
  305   "number is too big";
 306
 307
 308 /* Table to identify digits and hex digits. This is used when compiling
 309 patterns. Note that the tables in chartables are dependent on the locale, and
 310 may mark arbitrary characters as digits - but the PCRE compiling code expects
 311 to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
 312 a private table here. It costs 256 bytes, but it is a lot faster than doing
 313 character value tests (at least in some simple cases I timed), and in some
 314 applications one wants PCRE to compile efficiently as well as match
 315 efficiently.
 316
 317 For convenience, we use the same bit definitions as in chartables:
 318
 319   0x04   decimal digit
 320   0x08   hexadecimal digit
 321
 322 Then we can use ctype_digit and ctype_xdigit in the code. */
 323
  324 #ifndef EBCDIC  /* This is the "normal" case, for ASCII systems */
/* Indexed by character code (256 entries). 0x0c marks a decimal digit, which
is also a hexadecimal digit; 0x08 marks the letters a-f and A-F, which are
hexadecimal digits only. */
  325 static const unsigned char digitab[] =
  326   {
  327   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */
  328   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15 */
  329   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 */
  330   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
  331   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - '  */
  332   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ( - /  */
  333   0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  */
  334   0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /*  8 - ?  */
  335   0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  @ - G  */
  336   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  H - O  */
  337   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  P - W  */
  338   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  X - _  */
  339   0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  ` - g  */
  340   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  h - o  */
  341   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  p - w  */
  342   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  x -127 */
  343   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
  344   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
  345   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
  346   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
  347   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
  348   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
  349   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
  350   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
  351   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
  352   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
  353   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
  354   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
  355   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
  356   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
  357   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
  358   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
  359
  360 #else           /* This is the "abnormal" case, for EBCDIC systems */
/* Same meaning as the ASCII table, but laid out by EBCDIC character code. The
hexadecimal value at the right of alternate rows is the code of the first
entry in that row. */
  361 static const unsigned char digitab[] =
  362   {
  363   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */
  364   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15    */
  365   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 10 */
  366   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31    */
  367   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  32- 39 20 */
  368   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  40- 47    */
  369   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  48- 55 30 */
  370   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  56- 63    */
  371   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 40 */
  372   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  72- |     */
  373   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 50 */
  374   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  88- 95    */
  375   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 60 */
  376   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ?     */
  377   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
  378   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- "     */
  379   0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g  80 */
  380   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  h -143    */
  381   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p  90 */
  382   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  q -159    */
  383   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x  A0 */
  384   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  y -175    */
  385   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ^ -183 B0 */
  386   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191    */
  387   0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  { - G  C0 */
  388   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  H -207    */
  389   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  } - P  D0 */
  390   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  Q -223    */
  391   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  \ - X  E0 */
  392   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  Y -239    */
  393   0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  F0 */
  394   0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255    */
  395
/* Used by check_escape() in the EBCDIC build only: a character is treated as
alphameric there when (ebcdic_chartab[c] & 0x0E) is non-zero. */
  396 static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
  397   0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*   0-  7 */
  398   0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /*   8- 15 */
  399   0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  16- 23 */
  400   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
  401   0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  32- 39 */
  402   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  40- 47 */
  403   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  48- 55 */
  404   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  56- 63 */
  405   0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 */
  406   0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /*  72- |  */
  407   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 */
  408   0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /*  88- 95 */
  409   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 */
  410   0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ?  */
  411   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
  412   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- "  */
  413   0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g  */
  414   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  h -143 */
  415   0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p  */
  416   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  q -159 */
  417   0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x  */
  418   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  y -175 */
  419   0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ^ -183 */
  420   0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
  421   0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /*  { - G  */
  422   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  H -207 */
  423   0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /*  } - P  */
  424   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  Q -223 */
  425   0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /*  \ - X  */
  426   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  Y -239 */
  427   0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /*  0 - 7  */
  428   0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255 */
  429 #endif
 430
 431
 432 /* Definition to allow mutual recursion */
 433
/* Forward declaration only. The parameters are unnamed here; see the
definition of compile_regex() for the meaning of each argument. */
  434 static BOOL
  435   compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
  436     int *, int *, branch_chain *, compile_data *, int *);
 437
 438
 439
 440 /*************************************************
 441 *            Find an error text                  *
 442 *************************************************/
 443
 444 /* The error texts are now all in one long string, to save on relocations. As
 445 some of the text is of unknown length, we can't use a table of offsets.
 446 Instead, just count through the strings. This is not a performance issue
 447 because it happens only when there has been a compilation error.
 448
 449 Argument:   the error number
 450 Returns:    pointer to the error string
 451 */
 452
 453 static const char *
 454 find_error_text(int n)
 455 {
 456 const char *s = error_texts;
 457 for (; n > 0; n--) while (*s++ != 0);
 458 return s;
 459 }
 460
 461
 462 /*************************************************
 463 *            Handle escapes                      *
 464 *************************************************/
 465
 466 /* This function is called when a \ has been encountered. It either returns a
 467 positive value for a simple escape such as \n, or a negative value which
 468 encodes one of the more complicated things such as \d. A backreference to group
 469 n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
 470 UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
 471 ptr is pointing at the \. On exit, it is on the final character of the escape
 472 sequence.
 473
 474 Arguments:
 475   ptrptr         points to the pattern position pointer
 476   errorcodeptr   points to the errorcode variable
 477   bracount       number of previous extracting brackets
 478   options        the options bits
 479   isclass        TRUE if inside a character class
 480
 481 Returns:         zero or positive => a data character
 482                  negative => a special escape sequence
 483                  on error, errorcodeptr is set
 484 */
 485
 486 static int
 487 check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
 488   int options, BOOL isclass)
 489 {
 490 BOOL utf8 = (options & PCRE_UTF8) != 0;
 491 const uschar *ptr = *ptrptr + 1;
 492 int c, i;
 493
 494 GETCHARINCTEST(c, ptr);           /* Get character value, increment pointer */
 495 ptr--;                            /* Set pointer back to the last byte */
 496
 497 /* If backslash is at the end of the pattern, it's an error. */
 498
 499 if (c == 0) *errorcodeptr = ERR1;
 500
 501 /* Non-alphamerics are literals. For digits or letters, do an initial lookup in
 502 a table. A non-zero result is something that can be returned immediately.
 503 Otherwise further processing may be required. */
 504
 505 #ifndef EBCDIC  /* ASCII coding */
 506 else if (c < '0' || c > 'z') {}                           /* Not alphameric */
 507 else if ((i = escapes[c - '0']) != 0) c = i;
 508
 509 #else           /* EBCDIC coding */
 510 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {}   /* Not alphameric */
 511 else if ((i = escapes[c - 0x48]) != 0)  c = i;
 512 #endif
 513
 514 /* Escapes that need further processing, or are illegal. */
 515
 516 else
 517   {
 518   const uschar *oldptr;
 519   BOOL braced, negated;
 520
 521   switch (c)
 522     {
 523     /* A number of Perl escapes are not handled by PCRE. We give an explicit
 524     error. */
 525
 526     case 'l':
 527     case 'L':
 528     case 'N':
 529     case 'u':
 530     case 'U':
 531     *errorcodeptr = ERR37;
 532     break;
 533
 534     /* \g must be followed by a number, either plain or braced. If positive, it
 535     is an absolute backreference. If negative, it is a relative backreference.
 536     This is a Perl 5.10 feature. Perl 5.10 also supports \g{name} as a
 537     reference to a named group. This is part of Perl's movement towards a
 538     unified syntax for back references. As this is synonymous with \k{name}, we
 539     fudge it up by pretending it really was \k. */
 540
 541     case 'g':
 542     if (ptr[1] == '{')
 543       {
 544       const uschar *p;
 545       for (p = ptr+2; *p != 0 && *p != '}'; p++)
 546         if (*p != '-' && (digitab[*p] & ctype_digit) == 0) break;
 547       if (*p != 0 && *p != '}')
 548         {
 549         c = -ESC_k;
 550         break;
 551         }
 552       braced = TRUE;
 553       ptr++;
 554       }
 555     else braced = FALSE;
 556
 557     if (ptr[1] == '-')
 558       {
 559       negated = TRUE;
 560       ptr++;
 561       }
 562     else negated = FALSE;
 563
 564     c = 0;
 565     while ((digitab[ptr[1]] & ctype_digit) != 0)
 566       c = c * 10 + *(++ptr) - '0';
 567
 568     if (c < 0)
 569       {
 570       *errorcodeptr = ERR61;
 571       break;
 572       }
 573
 574     if (c == 0 || (braced && *(++ptr) != '}'))
 575       {
 576       *errorcodeptr = ERR57;
 577       break;
 578       }
 579
 580     if (negated)
 581       {
 582       if (c > bracount)
 583         {
 584         *errorcodeptr = ERR15;
 585         break;
 586         }
 587       c = bracount - (c - 1);
 588       }
 589
 590     c = -(ESC_REF + c);
 591     break;
 592
 593     /* The handling of escape sequences consisting of a string of digits
 594     starting with one that is not zero is not straightforward. By experiment,
 595     the way Perl works seems to be as follows:
 596
 597     Outside a character class, the digits are read as a decimal number. If the
 598     number is less than 10, or if there are that many previous extracting
 599     left brackets, then it is a back reference. Otherwise, up to three octal
 600     digits are read to form an escaped byte. Thus \123 is likely to be octal
 601     123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
 602     value is greater than 377, the least significant 8 bits are taken. Inside a
 603     character class, \ followed by a digit is always an octal number. */
 604
 605     case '1': case '2': case '3': case '4': case '5':
 606     case '6': case '7': case '8': case '9':
 607
 608     if (!isclass)
 609       {
 610       oldptr = ptr;
 611       c -= '0';
 612       while ((digitab[ptr[1]] & ctype_digit) != 0)
 613         c = c * 10 + *(++ptr) - '0';
 614       if (c < 0)
 615         {
 616         *errorcodeptr = ERR61;
 617         break;
 618         }
 619       if (c < 10 || c <= bracount)
 620         {
 621         c = -(ESC_REF + c);
 622         break;
 623         }
 624       ptr = oldptr;      /* Put the pointer back and fall through */
 625       }
 626
 627     /* Handle an octal number following \. If the first digit is 8 or 9, Perl
 628     generates a binary zero byte and treats the digit as a following literal.
 629     Thus we have to pull back the pointer by one. */
 630
 631     if ((c = *ptr) >= '8')
 632       {
 633       ptr--;
 634       c = 0;
 635       break;
 636       }
 637
 638     /* \0 always starts an octal number, but we may drop through to here with a
 639     larger first octal digit. The original code used just to take the least
 640     significant 8 bits of octal numbers (I think this is what early Perls used
 641     to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
 642     than 3 octal digits. */
 643
 644     case '0':
 645     c -= '0';
 646     while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
 647         c = c * 8 + *(++ptr) - '0';
 648     if (!utf8 && c > 255) *errorcodeptr = ERR51;
 649     break;
 650
 651     /* \x is complicated. \x{ddd} is a character number which can be greater
 652     than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
 653     treated as a data character. */
 654
 655     case 'x':
 656     if (ptr[1] == '{')
 657       {
 658       const uschar *pt = ptr + 2;
 659       int count = 0;
 660
 661       c = 0;
 662       while ((digitab[*pt] & ctype_xdigit) != 0)
 663         {
 664         register int cc = *pt++;
 665         if (c == 0 && cc == '0') continue;     /* Leading zeroes */
 666         count++;
 667
 668 #ifndef EBCDIC  /* ASCII coding */
 669         if (cc >= 'a') cc -= 32;               /* Convert to upper case */
 670         c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));
 671 #else           /* EBCDIC coding */
 672         if (cc >= 'a' && cc <= 'z') cc += 64;  /* Convert to upper case */
 673         c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));
 674 #endif
 675         }
 676
 677       if (*pt == '}')
 678         {
 679         if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
 680         ptr = pt;
 681         break;
 682         }
 683
 684       /* If the sequence of hex digits does not end with '}', then we don't
 685       recognize this construct; fall through to the normal \x handling. */
 686       }
 687
 688     /* Read just a single-byte hex-defined char */
 689
 690     c = 0;
 691     while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
 692       {
 693       int cc;                               /* Some compilers don't like ++ */
 694       cc = *(++ptr);                        /* in initializers */
 695 #ifndef EBCDIC  /* ASCII coding */
 696       if (cc >= 'a') cc -= 32;              /* Convert to upper case */
 697       c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
 698 #else           /* EBCDIC coding */
 699       if (cc <= 'z') cc += 64;              /* Convert to upper case */
 700       c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
 701 #endif
 702       }
 703     break;
 704
 705     /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
 706     This coding is ASCII-specific, but then the whole concept of \cx is
 707     ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
 708
 709     case 'c':
 710     c = *(++ptr);
 711     if (c == 0)
 712       {
 713       *errorcodeptr = ERR2;
 714       break;
 715       }
 716
 717 #ifndef EBCDIC  /* ASCII coding */
 718     if (c >= 'a' && c <= 'z') c -= 32;
 719     c ^= 0x40;
 720 #else           /* EBCDIC coding */
 721     if (c >= 'a' && c <= 'z') c += 64;
 722     c ^= 0xC0;
 723 #endif
 724     break;
 725
 726     /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
 727     other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
 728     for Perl compatibility, it is a literal. This code looks a bit odd, but
 729     there used to be some cases other than the default, and there may be again
 730     in future, so I haven't "optimized" it. */
 731
 732     default:
 733     if ((options & PCRE_EXTRA) != 0) switch(c)
 734       {
 735       default:
 736       *errorcodeptr = ERR3;
 737       break;
 738       }
 739     break;
 740     }
 741   }
 742
 743 *ptrptr = ptr;
 744 return c;
 745 }
 746
 747
 748
 749 #ifdef SUPPORT_UCP
 750 /*************************************************
 751 *               Handle \P and \p                 *
 752 *************************************************/
 753
 754 /* This function is called after \P or \p has been encountered, provided that
 755 PCRE is compiled with support for Unicode properties. On entry, ptrptr is
 756 pointing at the P or p. On exit, it is pointing at the final character of the
 757 escape sequence.
 758
 759 Argument:
 760   ptrptr         points to the pattern position pointer
 761   negptr         points to a boolean that is set TRUE for negation else FALSE
 762   dptr           points to an int that is set to the detailed property value
 763   errorcodeptr   points to the error code variable
 764
 765 Returns:         type value from ucp_type_table, or -1 for an invalid type
 766 */
 767
 768 static int
 769 get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
 770 {
 771 int c, i, bot, top;
 772 const uschar *ptr = *ptrptr;
 773 char name[32];
 774
 775 c = *(++ptr);
 776 if (c == 0) goto ERROR_RETURN;
 777
 778 *negptr = FALSE;
 779
 780 /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
 781 negation. */
 782
 783 if (c == '{')
 784   {
 785   if (ptr[1] == '^')
 786     {
 787     *negptr = TRUE;
 788     ptr++;
 789     }
 790   for (i = 0; i < (int)sizeof(name) - 1; i++)
 791     {
 792     c = *(++ptr);
 793     if (c == 0) goto ERROR_RETURN;
 794     if (c == '}') break;
 795     name[i] = c;
 796     }
 797   if (c !='}') goto ERROR_RETURN;
 798   name[i] = 0;
 799   }
 800
 801 /* Otherwise there is just one following character */
 802
 803 else
 804   {
 805   name[0] = c;
 806   name[1] = 0;
 807   }
 808
 809 *ptrptr = ptr;
 810
 811 /* Search for a recognized property name using binary chop */
 812
 813 bot = 0;
 814 top = _pcre_utt_size;
 815
 816 while (bot < top)
 817   {
 818   i = (bot + top) >> 1;
 819   c = strcmp(name, _pcre_utt_names + _pcre_utt[i].name_offset);
 820   if (c == 0)
 821     {
 822     *dptr = _pcre_utt[i].value;
 823     return _pcre_utt[i].type;
 824     }
 825   if (c > 0) bot = i + 1; else top = i;
 826   }
 827
 828 *errorcodeptr = ERR47;
 829 *ptrptr = ptr;
 830 return -1;
 831
 832 ERROR_RETURN:
 833 *errorcodeptr = ERR46;
 834 *ptrptr = ptr;
 835 return -1;
 836 }
 837 #endif
 838
 839
 840
 841
 842 /*************************************************
 843 *            Check for counted repeat            *
 844 *************************************************/
 845
 846 /* This function is called when a '{' is encountered in a place where it might
 847 start a quantifier. It looks ahead to see if it really is a quantifier or not.
 848 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
 849 where the ddds are digits.
 850
 851 Arguments:
 852   p         pointer to the first char after '{'
 853
 854 Returns:    TRUE or FALSE
 855 */
 856
 857 static BOOL
 858 is_counted_repeat(const uschar *p)
 859 {
 860 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
 861 while ((digitab[*p] & ctype_digit) != 0) p++;
 862 if (*p == '}') return TRUE;
 863
 864 if (*p++ != ',') return FALSE;
 865 if (*p == '}') return TRUE;
 866
 867 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
 868 while ((digitab[*p] & ctype_digit) != 0) p++;
 869
 870 return (*p == '}');
 871 }
 872
 873
 874
 875 /*************************************************
 876 *         Read repeat counts                     *
 877 *************************************************/
 878
 879 /* Read an item of the form {n,m} and return the values. This is called only
 880 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
 881 so the syntax is guaranteed to be correct, but we need to check the values.
 882
 883 Arguments:
 884   p              pointer to first char after '{'
 885   minp           pointer to int for min
 886   maxp           pointer to int for max
 887                  returned as -1 if no max
 888   errorcodeptr   points to error code variable
 889
 890 Returns:         pointer to '}' on success;
 891                  current ptr on error, with errorcodeptr set non-zero
 892 */
 893
 894 static const uschar *
 895 read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
 896 {
 897 int min = 0;
 898 int max = -1;
 899
 900 /* Read the minimum value and do a paranoid check: a negative value indicates
 901 an integer overflow. */
 902
 903 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
 904 if (min < 0 || min > 65535)
 905   {
 906   *errorcodeptr = ERR5;
 907   return p;
 908   }
 909
 910 /* Read the maximum value if there is one, and again do a paranoid on its size.
 911 Also, max must not be less than min. */
 912
 913 if (*p == '}') max = min; else
 914   {
 915   if (*(++p) != '}')
 916     {
 917     max = 0;
 918     while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
 919     if (max < 0 || max > 65535)
 920       {
 921       *errorcodeptr = ERR5;
 922       return p;
 923       }
 924     if (max < min)
 925       {
 926       *errorcodeptr = ERR4;
 927       return p;
 928       }
 929     }
 930   }
 931
 932 /* Fill in the required variables, and pass back the pointer to the terminating
 933 '}'. */
 934
 935 *minp = min;
 936 *maxp = max;
 937 return p;
 938 }
 939
 940
 941
 942 /*************************************************
 943 *       Find forward referenced subpattern       *
 944 *************************************************/
 945
 946 /* This function scans along a pattern's text looking for capturing
 947 subpatterns, and counting them. If it finds a named pattern that matches the
 948 name it is given, it returns its number. Alternatively, if the name is NULL, it
 949 returns when it reaches a given numbered subpattern. This is used for forward
 950 references to subpatterns. We know that if (?P< is encountered, the name will
 951 be terminated by '>' because that is checked in the first pass.
 952
 953 Arguments:
 954   ptr          current position in the pattern
 955   count        current count of capturing parens so far encountered
 956   name         name to seek, or NULL if seeking a numbered subpattern
 957   lorn         name length, or subpattern number if name is NULL
 958   xmode        TRUE if we are in /x mode
 959
 960 Returns:       the number of the named subpattern, or -1 if not found
 961 */
 962
 963 static int
 964 find_parens(const uschar *ptr, int count, const uschar *name, int lorn,
 965   BOOL xmode)
 966 {
 967 const uschar *thisname;
 968
 969 for (; *ptr != 0; ptr++)
 970   {
 971   int term;
 972
 973   /* Skip over backslashed characters and also entire \Q...\E */
 974
 975   if (*ptr == '\\')
 976     {
 977     if (*(++ptr) == 0) return -1;
 978     if (*ptr == 'Q') for (;;)
 979       {
 980       while (*(++ptr) != 0 && *ptr != '\\');
 981       if (*ptr == 0) return -1;
 982       if (*(++ptr) == 'E') break;
 983       }
 984     continue;
 985     }
 986
 987   /* Skip over character classes */
 988
 989   if (*ptr == '[')
 990     {
 991     while (*(++ptr) != ']')
 992       {
 993       if (*ptr == 0) return -1;
 994       if (*ptr == '\\')
 995         {
 996         if (*(++ptr) == 0) return -1;
 997         if (*ptr == 'Q') for (;;)
 998           {
 999           while (*(++ptr) != 0 && *ptr != '\\');
1000           if (*ptr == 0) return -1;
1001           if (*(++ptr) == 'E') break;
1002           }
1003         continue;
1004         }
1005       }
1006     continue;
1007     }
1008
1009   /* Skip comments in /x mode */
1010
1011   if (xmode && *ptr == '#')
1012     {
1013     while (*(++ptr) != 0 && *ptr != '\n');
1014     if (*ptr == 0) return -1;
1015     continue;
1016     }
1017
1018   /* An opening parens must now be a real metacharacter */
1019
1020   if (*ptr != '(') continue;
1021   if (ptr[1] != '?' && ptr[1] != '*')
1022     {
1023     count++;
1024     if (name == NULL && count == lorn) return count;
1025     continue;
1026     }
1027
1028   ptr += 2;
1029   if (*ptr == 'P') ptr++;                      /* Allow optional P */
1030
1031   /* We have to disambiguate (?<! and (?<= from (?<name> */
1032
1033   if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') &&
1034        *ptr != '\'')
1035     continue;
1036
1037   count++;
1038
1039   if (name == NULL && count == lorn) return count;
1040   term = *ptr++;
1041   if (term == '<') term = '>';
1042   thisname = ptr;
1043   while (*ptr != term) ptr++;
1044   if (name != NULL && lorn == ptr - thisname &&
1045       strncmp((const char *)name, (const char *)thisname, lorn) == 0)
1046     return count;
1047   }
1048
1049 return -1;
1050 }
1051
1052
1053
1054 /*************************************************
1055 *      Find first significant op code            *
1056 *************************************************/
1057
1058 /* This is called by several functions that scan a compiled expression looking
1059 for a fixed first character, or an anchoring op code etc. It skips over things
1060 that do not influence this. For some calls, a change of option is important.
1061 For some calls, it makes sense to skip negative forward and all backward
1062 assertions, and also the \b assertion; for others it does not.
1063
1064 Arguments:
1065   code         pointer to the start of the group
1066   options      pointer to external options
1067   optbit       the option bit whose changing is significant, or
1068                  zero if none are
1069   skipassert   TRUE if certain assertions are to be skipped
1070
1071 Returns:       pointer to the first significant opcode
1072 */
1073
1074 static const uschar*
1075 first_significant_code(const uschar *code, int *options, int optbit,
1076   BOOL skipassert)
1077 {
1078 for (;;)
1079   {
1080   switch ((int)*code)
1081     {
1082     case OP_OPT:
1083     if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
1084       *options = (int)code[1];
1085     code += 2;
1086     break;
1087
1088     case OP_ASSERT_NOT:
1089     case OP_ASSERTBACK:
1090     case OP_ASSERTBACK_NOT:
1091     if (!skipassert) return code;
1092     do code += GET(code, 1); while (*code == OP_ALT);
1093     code += _pcre_OP_lengths[*code];
1094     break;
1095
1096     case OP_WORD_BOUNDARY:
1097     case OP_NOT_WORD_BOUNDARY:
1098     if (!skipassert) return code;
1099     /* Fall through */
1100
1101     case OP_CALLOUT:
1102     case OP_CREF:
1103     case OP_RREF:
1104     case OP_DEF:
1105     code += _pcre_OP_lengths[*code];
1106     break;
1107
1108     default:
1109     return code;
1110     }
1111   }
1112 /* Control never reaches here */
1113 }
1114
1115
1116
1117
1118 /*************************************************
1119 *        Find the fixed length of a pattern      *
1120 *************************************************/
1121
1122 /* Scan a pattern and compute the fixed length of subject that will match it,
1123 if the length is fixed. This is needed for dealing with backward assertions.
1124 In UTF8 mode, the result is in characters rather than bytes.
1125
1126 Arguments:
1127   code     points to the start of the pattern (the bracket)
1128   options  the compiling options
1129
1130 Returns:   the fixed length, or -1 if there is no fixed length,
1131              or -2 if \C was encountered
1132 */
1133
1134 static int
1135 find_fixedlength(uschar *code, int options)
1136 {
1137 int length = -1;
1138
1139 register int branchlength = 0;
1140 register uschar *cc = code + 1 + LINK_SIZE;
1141
1142 /* Scan along the opcodes for this branch. If we get to the end of the
1143 branch, check the length against that of the other branches. */
1144
1145 for (;;)
1146   {
1147   int d;
1148   register int op = *cc;
1149   switch (op)
1150     {
1151     case OP_CBRA:
1152     case OP_BRA:
1153     case OP_ONCE:
1154     case OP_COND:
1155     d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
1156     if (d < 0) return d;
1157     branchlength += d;
1158     do cc += GET(cc, 1); while (*cc == OP_ALT);
1159     cc += 1 + LINK_SIZE;
1160     break;
1161
1162     /* Reached end of a branch; if it's a ket it is the end of a nested
1163     call. If it's ALT it is an alternation in a nested call. If it is
1164     END it's the end of the outer call. All can be handled by the same code. */
1165
1166     case OP_ALT:
1167     case OP_KET:
1168     case OP_KETRMAX:
1169     case OP_KETRMIN:
1170     case OP_END:
1171     if (length < 0) length = branchlength;
1172       else if (length != branchlength) return -1;
1173     if (*cc != OP_ALT) return length;
1174     cc += 1 + LINK_SIZE;
1175     branchlength = 0;
1176     break;
1177
1178     /* Skip over assertive subpatterns */
1179
1180     case OP_ASSERT:
1181     case OP_ASSERT_NOT:
1182     case OP_ASSERTBACK:
1183     case OP_ASSERTBACK_NOT:
1184     do cc += GET(cc, 1); while (*cc == OP_ALT);
1185     /* Fall through */
1186
1187     /* Skip over things that don't match chars */
1188
1189     case OP_REVERSE:
1190     case OP_CREF:
1191     case OP_RREF:
1192     case OP_DEF:
1193     case OP_OPT:
1194     case OP_CALLOUT:
1195     case OP_SOD:
1196     case OP_SOM:
1197     case OP_EOD:
1198     case OP_EODN:
1199     case OP_CIRC:
1200     case OP_DOLL:
1201     case OP_NOT_WORD_BOUNDARY:
1202     case OP_WORD_BOUNDARY:
1203     cc += _pcre_OP_lengths[*cc];
1204     break;
1205
1206     /* Handle literal characters */
1207
1208     case OP_CHAR:
1209     case OP_CHARNC:
1210     case OP_NOT:
1211     branchlength++;
1212     cc += 2;
1213 #ifdef SUPPORT_UTF8
1214     if ((options & PCRE_UTF8) != 0)
1215       {
1216       while ((*cc & 0xc0) == 0x80) cc++;
1217       }
1218 #endif
1219     break;
1220
1221     /* Handle exact repetitions. The count is already in characters, but we
1222     need to skip over a multibyte character in UTF8 mode.  */
1223
1224     case OP_EXACT:
1225     branchlength += GET2(cc,1);
1226     cc += 4;
1227 #ifdef SUPPORT_UTF8
1228     if ((options & PCRE_UTF8) != 0)
1229       {
1230       while((*cc & 0x80) == 0x80) cc++;
1231       }
1232 #endif
1233     break;
1234
1235     case OP_TYPEEXACT:
1236     branchlength += GET2(cc,1);
1237     if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
1238     cc += 4;
1239     break;
1240
1241     /* Handle single-char matchers */
1242
1243     case OP_PROP:
1244     case OP_NOTPROP:
1245     cc += 2;
1246     /* Fall through */
1247
1248     case OP_NOT_DIGIT:
1249     case OP_DIGIT:
1250     case OP_NOT_WHITESPACE:
1251     case OP_WHITESPACE:
1252     case OP_NOT_WORDCHAR:
1253     case OP_WORDCHAR:
1254     case OP_ANY:
1255     branchlength++;
1256     cc++;
1257     break;
1258
1259     /* The single-byte matcher isn't allowed */
1260
1261     case OP_ANYBYTE:
1262     return -2;
1263
1264     /* Check a class for variable quantification */
1265
1266 #ifdef SUPPORT_UTF8
1267     case OP_XCLASS:
1268     cc += GET(cc, 1) - 33;
1269     /* Fall through */
1270 #endif
1271
1272     case OP_CLASS:
1273     case OP_NCLASS:
1274     cc += 33;
1275
1276     switch (*cc)
1277       {
1278       case OP_CRSTAR:
1279       case OP_CRMINSTAR:
1280       case OP_CRQUERY:
1281       case OP_CRMINQUERY:
1282       return -1;
1283
1284       case OP_CRRANGE:
1285       case OP_CRMINRANGE:
1286       if (GET2(cc,1) != GET2(cc,3)) return -1;
1287       branchlength += GET2(cc,1);
1288       cc += 5;
1289       break;
1290
1291       default:
1292       branchlength++;
1293       }
1294     break;
1295
1296     /* Anything else is variable length */
1297
1298     default:
1299     return -1;
1300     }
1301   }
1302 /* Control never gets here */
1303 }
1304
1305
1306
1307
1308 /*************************************************
1309 *    Scan compiled regex for numbered bracket    *
1310 *************************************************/
1311
1312 /* This little function scans through a compiled pattern until it finds a
1313 capturing bracket with the given number.
1314
1315 Arguments:
1316   code        points to start of expression
1317   utf8        TRUE in UTF-8 mode
1318   number      the required bracket number
1319
1320 Returns:      pointer to the opcode for the bracket, or NULL if not found
1321 */
1322
1323 static const uschar *
1324 find_bracket(const uschar *code, BOOL utf8, int number)
1325 {
1326 for (;;)
1327   {
1328   register int c = *code;
1329   if (c == OP_END) return NULL;
1330
1331   /* XCLASS is used for classes that cannot be represented just by a bit
1332   map. This includes negated single high-valued characters. The length in
1333   the table is zero; the actual length is stored in the compiled code. */
1334
1335   if (c == OP_XCLASS) code += GET(code, 1);
1336
1337   /* Handle capturing bracket */
1338
1339   else if (c == OP_CBRA)
1340     {
1341     int n = GET2(code, 1+LINK_SIZE);
1342     if (n == number) return (uschar *)code;
1343     code += _pcre_OP_lengths[c];
1344     }
1345
1346   /* Otherwise, we can get the item's length from the table, except that for
1347   repeated character types, we have to test for \p and \P, which have an extra
1348   two bytes of parameters. */
1349
1350   else
1351     {
1352     switch(c)
1353       {
1354       case OP_TYPESTAR:
1355       case OP_TYPEMINSTAR:
1356       case OP_TYPEPLUS:
1357       case OP_TYPEMINPLUS:
1358       case OP_TYPEQUERY:
1359       case OP_TYPEMINQUERY:
1360       case OP_TYPEPOSSTAR:
1361       case OP_TYPEPOSPLUS:
1362       case OP_TYPEPOSQUERY:
1363       if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1364       break;
1365
1366       case OP_TYPEUPTO:
1367       case OP_TYPEMINUPTO:
1368       case OP_TYPEEXACT:
1369       case OP_TYPEPOSUPTO:
1370       if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1371       break;
1372       }
1373
1374     /* Add in the fixed length from the table */
1375
1376     code += _pcre_OP_lengths[c];
1377
1378   /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1379   a multi-byte character. The length in the table is a minimum, so we have to
1380   arrange to skip the extra bytes. */
1381
1382 #ifdef SUPPORT_UTF8
1383     if (utf8) switch(c)
1384       {
1385       case OP_CHAR:
1386       case OP_CHARNC:
1387       case OP_EXACT:
1388       case OP_UPTO:
1389       case OP_MINUPTO:
1390       case OP_POSUPTO:
1391       case OP_STAR:
1392       case OP_MINSTAR:
1393       case OP_POSSTAR:
1394       case OP_PLUS:
1395       case OP_MINPLUS:
1396       case OP_POSPLUS:
1397       case OP_QUERY:
1398       case OP_MINQUERY:
1399       case OP_POSQUERY:
1400       if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1401       break;
1402       }
1403 #endif
1404     }
1405   }
1406 }
1407
1408
1409
1410 /*************************************************
1411 *   Scan compiled regex for recursion reference  *
1412 *************************************************/
1413
1414 /* This little function scans through a compiled pattern until it finds an
1415 instance of OP_RECURSE.
1416
1417 Arguments:
1418   code        points to start of expression
1419   utf8        TRUE in UTF-8 mode
1420
1421 Returns:      pointer to the opcode for OP_RECURSE, or NULL if not found
1422 */
1423
1424 static const uschar *
1425 find_recurse(const uschar *code, BOOL utf8)
1426 {
1427 for (;;)
1428   {
1429   register int c = *code;
1430   if (c == OP_END) return NULL;
1431   if (c == OP_RECURSE) return code;
1432
1433   /* XCLASS is used for classes that cannot be represented just by a bit
1434   map. This includes negated single high-valued characters. The length in
1435   the table is zero; the actual length is stored in the compiled code. */
1436
1437   if (c == OP_XCLASS) code += GET(code, 1);
1438
1439   /* Otherwise, we can get the item's length from the table, except that for
1440   repeated character types, we have to test for \p and \P, which have an extra
1441   two bytes of parameters. */
1442
1443   else
1444     {
1445     switch(c)
1446       {
1447       case OP_TYPESTAR:
1448       case OP_TYPEMINSTAR:
1449       case OP_TYPEPLUS:
1450       case OP_TYPEMINPLUS:
1451       case OP_TYPEQUERY:
1452       case OP_TYPEMINQUERY:
1453       case OP_TYPEPOSSTAR:
1454       case OP_TYPEPOSPLUS:
1455       case OP_TYPEPOSQUERY:
1456       if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1457       break;
1458
1459       case OP_TYPEPOSUPTO:
1460       case OP_TYPEUPTO:
1461       case OP_TYPEMINUPTO:
1462       case OP_TYPEEXACT:
1463       if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1464       break;
1465       }
1466
1467     /* Add in the fixed length from the table */
1468
1469     code += _pcre_OP_lengths[c];
1470
1471     /* In UTF-8 mode, opcodes that are followed by a character may be followed
1472     by a multi-byte character. The length in the table is a minimum, so we have
1473     to arrange to skip the extra bytes. */
1474
1475 #ifdef SUPPORT_UTF8
1476     if (utf8) switch(c)
1477       {
1478       case OP_CHAR:
1479       case OP_CHARNC:
1480       case OP_EXACT:
1481       case OP_UPTO:
1482       case OP_MINUPTO:
1483       case OP_POSUPTO:
1484       case OP_STAR:
1485       case OP_MINSTAR:
1486       case OP_POSSTAR:
1487       case OP_PLUS:
1488       case OP_MINPLUS:
1489       case OP_POSPLUS:
1490       case OP_QUERY:
1491       case OP_MINQUERY:
1492       case OP_POSQUERY:
1493       if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1494       break;
1495       }
1496 #endif
1497     }
1498   }
1499 }
1500
1501
1502
1503 /*************************************************
1504 *    Scan compiled branch for non-emptiness      *
1505 *************************************************/
1506
1507 /* This function scans through a branch of a compiled pattern to see whether it
1508 can match the empty string or not. It is called from could_be_empty()
1509 below and from compile_branch() when checking for an unlimited repeat of a
1510 group that can match nothing. Note that first_significant_code() skips over
1511 assertions. If we hit an unclosed bracket, we return "empty" - this means we've
1512 struck an inner bracket whose current branch will already have been scanned.
1513
1514 Arguments:
1515   code        points to start of search
1516   endcode     points to where to stop
1517   utf8        TRUE if in UTF8 mode
1518
1519 Returns:      TRUE if what is matched could be empty
1520 */
1521
1522 static BOOL
1523 could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1524 {
1525 register int c;
1526 for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1527      code < endcode;
1528      code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1529   {
1530   const uschar *ccode;
1531
1532   c = *code;
1533
1534   /* Groups with zero repeats can of course be empty; skip them. */
1535
1536   if (c == OP_BRAZERO || c == OP_BRAMINZERO)
1537     {
1538     code += _pcre_OP_lengths[c];
1539     do code += GET(code, 1); while (*code == OP_ALT);
1540     c = *code;
1541     continue;
1542     }
1543
1544   /* For other groups, scan the branches. */
1545
1546   if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND)
1547     {
1548     BOOL empty_branch;
1549     if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */
1550
1551     /* Scan a closed bracket */
1552
1553     empty_branch = FALSE;
1554     do
1555       {
1556       if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1557         empty_branch = TRUE;
1558       code += GET(code, 1);
1559       }
1560     while (*code == OP_ALT);
1561     if (!empty_branch) return FALSE;   /* All branches are non-empty */
1562     c = *code;
1563     continue;
1564     }
1565
1566   /* Handle the other opcodes */
1567
1568   switch (c)
1569     {
1570     /* Check for quantifiers after a class. XCLASS is used for classes that
1571     cannot be represented just by a bit map. This includes negated single
1572     high-valued characters. The length in _pcre_OP_lengths[] is zero; the
1573     actual length is stored in the compiled code, so we must update "code"
1574     here. */
1575
1576 #ifdef SUPPORT_UTF8
1577     case OP_XCLASS:
1578     ccode = code += GET(code, 1);
1579     goto CHECK_CLASS_REPEAT;
1580 #endif
1581
1582     case OP_CLASS:
1583     case OP_NCLASS:
1584     ccode = code + 33;
1585
1586 #ifdef SUPPORT_UTF8
1587     CHECK_CLASS_REPEAT:
1588 #endif
1589
1590     switch (*ccode)
1591       {
1592       case OP_CRSTAR:            /* These could be empty; continue */
1593       case OP_CRMINSTAR:
1594       case OP_CRQUERY:
1595       case OP_CRMINQUERY:
1596       break;
1597
1598       default:                   /* Non-repeat => class must match */
1599       case OP_CRPLUS:            /* These repeats aren't empty */
1600       case OP_CRMINPLUS:
1601       return FALSE;
1602
1603       case OP_CRRANGE:
1604       case OP_CRMINRANGE:
1605       if (GET2(ccode, 1) > 0) return FALSE;  /* Minimum > 0 */
1606       break;
1607       }
1608     break;
1609
1610     /* Opcodes that must match a character */
1611
1612     case OP_PROP:
1613     case OP_NOTPROP:
1614     case OP_EXTUNI:
1615     case OP_NOT_DIGIT:
1616     case OP_DIGIT:
1617     case OP_NOT_WHITESPACE:
1618     case OP_WHITESPACE:
1619     case OP_NOT_WORDCHAR:
1620     case OP_WORDCHAR:
1621     case OP_ANY:
1622     case OP_ANYBYTE:
1623     case OP_CHAR:
1624     case OP_CHARNC:
1625     case OP_NOT:
1626     case OP_PLUS:
1627     case OP_MINPLUS:
1628     case OP_POSPLUS:
1629     case OP_EXACT:
1630     case OP_NOTPLUS:
1631     case OP_NOTMINPLUS:
1632     case OP_NOTPOSPLUS:
1633     case OP_NOTEXACT:
1634     case OP_TYPEPLUS:
1635     case OP_TYPEMINPLUS:
1636     case OP_TYPEPOSPLUS:
1637     case OP_TYPEEXACT:
1638     return FALSE;
1639
1640     /* These are going to continue, as they may be empty, but we have to
1641     fudge the length for the \p and \P cases. */
1642
1643     case OP_TYPESTAR:
1644     case OP_TYPEMINSTAR:
1645     case OP_TYPEPOSSTAR:
1646     case OP_TYPEQUERY:
1647     case OP_TYPEMINQUERY:
1648     case OP_TYPEPOSQUERY:
1649     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1650     break;
1651
1652     /* Same for these */
1653
1654     case OP_TYPEUPTO:
1655     case OP_TYPEMINUPTO:
1656     case OP_TYPEPOSUPTO:
1657     if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1658     break;
1659
1660     /* End of branch */
1661
1662     case OP_KET:
1663     case OP_KETRMAX:
1664     case OP_KETRMIN:
1665     case OP_ALT:
1666     return TRUE;
1667
1668     /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1669     MINUPTO, and POSUPTO may be followed by a multibyte character */
1670
1671 #ifdef SUPPORT_UTF8
1672     case OP_STAR:
1673     case OP_MINSTAR:
1674     case OP_POSSTAR:
1675     case OP_QUERY:
1676     case OP_MINQUERY:
1677     case OP_POSQUERY:
1678     case OP_UPTO:
1679     case OP_MINUPTO:
1680     case OP_POSUPTO:
1681     if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1682     break;
1683 #endif
1684     }
1685   }
1686
1687 return TRUE;
1688 }
1689
1690
1691
1692 /*************************************************
1693 *    Scan compiled regex for non-emptiness       *
1694 *************************************************/
1695
1696 /* This function is called to check for left recursive calls. We want to check
1697 the current branch of the current pattern to see if it could match the empty
1698 string. If it could, we must look outwards for branches at other levels,
1699 stopping when we pass beyond the bracket which is the subject of the recursion.
1700
1701 Arguments:
1702   code        points to start of the recursion
1703   endcode     points to where to stop (current RECURSE item)
1704   bcptr       points to the chain of current (unclosed) branch starts
1705   utf8        TRUE if in UTF-8 mode
1706
1707 Returns:      TRUE if what is matched could be empty
1708 */
1709
1710 static BOOL
1711 could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1712   BOOL utf8)
1713 {
1714 while (bcptr != NULL && bcptr->current >= code)
1715   {
1716   if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1717   bcptr = bcptr->outer;
1718   }
1719 return TRUE;
1720 }
1721
1722
1723
1724 /*************************************************
1725 *           Check for POSIX class syntax         *
1726 *************************************************/
1727
1728 /* This function is called when the sequence "[:" or "[." or "[=" is
1729 encountered in a character class. It checks whether this is followed by an
1730 optional ^ and then a sequence of letters, terminated by a matching ":]" or
1731 ".]" or "=]".
1732
1733 Argument:
1734   ptr      pointer to the initial [
1735   endptr   where to return the end pointer
1736   cd       pointer to compile data
1737
1738 Returns:   TRUE or FALSE
1739 */
1740
1741 static BOOL
1742 check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
1743 {
1744 int terminator;          /* Don't combine these lines; the Solaris cc */
1745 terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */
1746 if (*(++ptr) == '^') ptr++;
1747 while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
1748 if (*ptr == terminator && ptr[1] == ']')
1749   {
1750   *endptr = ptr;
1751   return TRUE;
1752   }
1753 return FALSE;
1754 }
1755
1756
1757
1758
1759 /*************************************************
1760 *          Check POSIX class name                *
1761 *************************************************/
1762
1763 /* This function is called to check the name given in a POSIX-style class entry
1764 such as [:alnum:].
1765
1766 Arguments:
1767   ptr        points to the first letter
1768   len        the length of the name
1769
1770 Returns:     a value representing the name, or -1 if unknown
1771 */
1772
1773 static int
1774 check_posix_name(const uschar *ptr, int len)
1775 {
1776 const char *pn = posix_names;
1777 register int yield = 0;
1778 while (posix_name_lengths[yield] != 0)
1779   {
1780   if (len == posix_name_lengths[yield] &&
1781     strncmp((const char *)ptr, pn, len) == 0) return yield;
1782   pn += posix_name_lengths[yield] + 1;
1783   yield++;
1784   }
1785 return -1;
1786 }
1787
1788
1789 /*************************************************
1790 *    Adjust OP_RECURSE items in repeated group   *
1791 *************************************************/
1792
1793 /* OP_RECURSE items contain an offset from the start of the regex to the group
1794 that is referenced. This means that groups can be replicated for fixed
1795 repetition simply by copying (because the recursion is allowed to refer to
1796 earlier groups that are outside the current group). However, when a group is
1797 optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
1798 it, after it has been compiled. This means that any OP_RECURSE items within it
1799 that refer to the group itself or any contained groups have to have their
1800 offsets adjusted. That one of the jobs of this function. Before it is called,
1801 the partially compiled regex must be temporarily terminated with OP_END.
1802
1803 This function has been extended with the possibility of forward references for
1804 recursions and subroutine calls. It must also check the list of such references
1805 for the group we are dealing with. If it finds that one of the recursions in
1806 the current group is on this list, it adjusts the offset in the list, not the
1807 value in the reference (which is a group number).
1808
1809 Arguments:
1810   group      points to the start of the group
1811   adjust     the amount by which the group is to be moved
1812   utf8       TRUE in UTF-8 mode
1813   cd         contains pointers to tables etc.
1814   save_hwm   the hwm forward reference pointer at the start of the group
1815
1816 Returns:     nothing
1817 */
1818
1819 static void
1820 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
1821   uschar *save_hwm)
1822 {
1823 uschar *ptr = group;
1824
1825 while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1826   {
1827   int offset;
1828   uschar *hc;
1829
1830   /* See if this recursion is on the forward reference list. If so, adjust the
1831   reference. */
1832
1833   for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
1834     {
1835     offset = GET(hc, 0);
1836     if (cd->start_code + offset == ptr + 1)
1837       {
1838       PUT(hc, 0, offset + adjust);
1839       break;
1840       }
1841     }
1842
1843   /* Otherwise, adjust the recursion offset if it's after the start of this
1844   group. */
1845
1846   if (hc >= cd->hwm)
1847     {
1848     offset = GET(ptr, 1);
1849     if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1850     }
1851
1852   ptr += 1 + LINK_SIZE;
1853   }
1854 }
1855
1856
1857
1858 /*************************************************
1859 *        Insert an automatic callout point       *
1860 *************************************************/
1861
1862 /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
1863 callout points before each pattern item.
1864
1865 Arguments:
1866   code           current code pointer
1867   ptr            current pattern pointer
1868   cd             pointers to tables etc
1869
1870 Returns:         new code pointer
1871 */
1872
1873 static uschar *
1874 auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
1875 {
1876 *code++ = OP_CALLOUT;
1877 *code++ = 255;
1878 PUT(code, 0, ptr - cd->start_pattern);  /* Pattern offset */
1879 PUT(code, LINK_SIZE, 0);                /* Default length */
1880 return code + 2*LINK_SIZE;
1881 }
1882
1883
1884
1885 /*************************************************
1886 *         Complete a callout item                *
1887 *************************************************/
1888
1889 /* A callout item contains the length of the next item in the pattern, which
1890 we can't fill in till after we have reached the relevant point. This is used
1891 for both automatic and manual callouts.
1892
1893 Arguments:
1894   previous_callout   points to previous callout item
1895   ptr                current pattern pointer
1896   cd                 pointers to tables etc
1897
1898 Returns:             nothing
1899 */
1900
1901 static void
1902 complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
1903 {
1904 int length = ptr - cd->start_pattern - GET(previous_callout, 2);
1905 PUT(previous_callout, 2 + LINK_SIZE, length);
1906 }
1907
1908
1909
#ifdef SUPPORT_UCP
/*************************************************
*           Get othercase range                  *
*************************************************/

/* This function is given the start and end of a class range, in UTF-8 mode
with UCP support. It scans upwards through the characters for a run whose
"other" case values are consecutive. Each call yields the next such run and
updates the starting value, so that a following call carries on from there.

Arguments:
  cptr        points to starting character value; updated
  d           end value
  ocptr       where to put start of othercase range
  odptr       where to put end of othercase range

Yield:        TRUE when range returned; FALSE when no more
*/

static BOOL
get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
  unsigned int *odptr)
{
unsigned int c = *cptr;
unsigned int first = NOTACHAR;
unsigned int expect;

/* Find the first character in the range that has an other case */

while (c <= d && (first = _pcre_ucp_othercase(c)) == NOTACHAR) c++;
if (c > d) return FALSE;

*ocptr = first;
expect = first + 1;

/* Extend the run for as long as the other case values stay consecutive */

c++;
while (c <= d && _pcre_ucp_othercase(c) == expect)
  {
  c++;
  expect++;
  }

*odptr = expect - 1;
*cptr = c;                     /* Where the next call is to start */

return TRUE;
}
#endif  /* SUPPORT_UCP */
1955
1956
1957
1958 /*************************************************
1959 *     Check if auto-possessifying is possible    *
1960 *************************************************/
1961
1962 /* This function is called for unlimited repeats of certain items, to see
1963 whether the next thing could possibly match the repeated item. If not, it makes
1964 sense to automatically possessify the repeated item.
1965
1966 Arguments:
1967   op_code       the repeated op code
1968   this          data for this item, depends on the opcode
1969   utf8          TRUE in UTF-8 mode
1970   utf8_char     used for utf8 character bytes, NULL if not relevant
1971   ptr           next character in pattern
1972   options       options bits
1973   cd            contains pointers to tables etc.
1974
1975 Returns:        TRUE if possessifying is wanted
1976 */
1977
1978 static BOOL
1979 check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
1980   const uschar *ptr, int options, compile_data *cd)
1981 {
1982 int next;
1983
1984 /* Skip whitespace and comments in extended mode */
1985
1986 if ((options & PCRE_EXTENDED) != 0)
1987   {
1988   for (;;)
1989     {
1990     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1991     if (*ptr == '#')
1992       {
1993       while (*(++ptr) != 0)
1994         if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1995       }
1996     else break;
1997     }
1998   }
1999
2000 /* If the next item is one that we can handle, get its value. A non-negative
2001 value is a character, a negative value is an escape value. */
2002
2003 if (*ptr == '\\')
2004   {
2005   int temperrorcode = 0;
2006   next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
2007   if (temperrorcode != 0) return FALSE;
2008   ptr++;    /* Point after the escape sequence */
2009   }
2010
2011 else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
2012   {
2013 #ifdef SUPPORT_UTF8
2014   if (utf8) { GETCHARINC(next, ptr); } else
2015 #endif
2016   next = *ptr++;
2017   }
2018
2019 else return FALSE;
2020
2021 /* Skip whitespace and comments in extended mode */
2022
2023 if ((options & PCRE_EXTENDED) != 0)
2024   {
2025   for (;;)
2026     {
2027     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2028     if (*ptr == '#')
2029       {
2030       while (*(++ptr) != 0)
2031         if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2032       }
2033     else break;
2034     }
2035   }
2036
2037 /* If the next thing is itself optional, we have to give up. */
2038
2039 if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0)
2040   return FALSE;
2041
2042 /* Now compare the next item with the previous opcode. If the previous is a
2043 positive single character match, "item" either contains the character or, if
2044 "item" is greater than 127 in utf8 mode, the character's bytes are in
2045 utf8_char. */
2046
2047
2048 /* Handle cases when the next item is a character. */
2049
2050 if (next >= 0) switch(op_code)
2051   {
2052   case OP_CHAR:
2053 #ifdef SUPPORT_UTF8
2054   if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2055 #endif
2056   return item != next;
2057
2058   /* For CHARNC (caseless character) we must check the other case. If we have
2059   Unicode property support, we can use it to test the other case of
2060   high-valued characters. */
2061
2062   case OP_CHARNC:
2063 #ifdef SUPPORT_UTF8
2064   if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2065 #endif
2066   if (item == next) return FALSE;
2067 #ifdef SUPPORT_UTF8
2068   if (utf8)
2069     {
2070     unsigned int othercase;
2071     if (next < 128) othercase = cd->fcc[next]; else
2072 #ifdef SUPPORT_UCP
2073     othercase = _pcre_ucp_othercase((unsigned int)next);
2074 #else
2075     othercase = NOTACHAR;
2076 #endif
2077     return (unsigned int)item != othercase;
2078     }
2079   else
2080 #endif  /* SUPPORT_UTF8 */
2081   return (item != cd->fcc[next]);  /* Non-UTF-8 mode */
2082
2083   /* For OP_NOT, "item" must be a single-byte character. */
2084
2085   case OP_NOT:
2086   if (next < 0) return FALSE;  /* Not a character */
2087   if (item == next) return TRUE;
2088   if ((options & PCRE_CASELESS) == 0) return FALSE;
2089 #ifdef SUPPORT_UTF8
2090   if (utf8)
2091     {
2092     unsigned int othercase;
2093     if (next < 128) othercase = cd->fcc[next]; else
2094 #ifdef SUPPORT_UCP
2095     othercase = _pcre_ucp_othercase(next);
2096 #else
2097     othercase = NOTACHAR;
2098 #endif
2099     return (unsigned int)item == othercase;
2100     }
2101   else
2102 #endif  /* SUPPORT_UTF8 */
2103   return (item == cd->fcc[next]);  /* Non-UTF-8 mode */
2104
2105   case OP_DIGIT:
2106   return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
2107
2108   case OP_NOT_DIGIT:
2109   return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
2110
2111   case OP_WHITESPACE:
2112   return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
2113
2114   case OP_NOT_WHITESPACE:
2115   return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
2116
2117   case OP_WORDCHAR:
2118   return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
2119
2120   case OP_NOT_WORDCHAR:
2121   return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
2122
2123   case OP_HSPACE:
2124   case OP_NOT_HSPACE:
2125   switch(next)
2126     {
2127     case 0x09:
2128     case 0x20:
2129     case 0xa0:
2130     case 0x1680:
2131     case 0x180e:
2132     case 0x2000:
2133     case 0x2001:
2134     case 0x2002:
2135     case 0x2003:
2136     case 0x2004:
2137     case 0x2005:
2138     case 0x2006:
2139     case 0x2007:
2140     case 0x2008:
2141     case 0x2009:
2142     case 0x200A:
2143     case 0x202f:
2144     case 0x205f:
2145     case 0x3000:
2146     return op_code != OP_HSPACE;
2147     default:
2148     return op_code == OP_HSPACE;
2149     }
2150
2151   case OP_VSPACE:
2152   case OP_NOT_VSPACE:
2153   switch(next)
2154     {
2155     case 0x0a:
2156     case 0x0b:
2157     case 0x0c:
2158     case 0x0d:
2159     case 0x85:
2160     case 0x2028:
2161     case 0x2029:
2162     return op_code != OP_VSPACE;
2163     default:
2164     return op_code == OP_VSPACE;
2165     }
2166
2167   default:
2168   return FALSE;
2169   }
2170
2171
2172 /* Handle the case when the next item is \d, \s, etc. */
2173
2174 switch(op_code)
2175   {
2176   case OP_CHAR:
2177   case OP_CHARNC:
2178 #ifdef SUPPORT_UTF8
2179   if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2180 #endif
2181   switch(-next)
2182     {
2183     case ESC_d:
2184     return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
2185
2186     case ESC_D:
2187     return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
2188
2189     case ESC_s:
2190     return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
2191
2192     case ESC_S:
2193     return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
2194
2195     case ESC_w:
2196     return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
2197
2198     case ESC_W:
2199     return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
2200
2201     case ESC_h:
2202     case ESC_H:
2203     switch(item)
2204       {
2205       case 0x09:
2206       case 0x20:
2207       case 0xa0:
2208       case 0x1680:
2209       case 0x180e:
2210       case 0x2000:
2211       case 0x2001:
2212       case 0x2002:
2213       case 0x2003:
2214       case 0x2004:
2215       case 0x2005:
2216       case 0x2006:
2217       case 0x2007:
2218       case 0x2008:
2219       case 0x2009:
2220       case 0x200A:
2221       case 0x202f:
2222       case 0x205f:
2223       case 0x3000:
2224       return -next != ESC_h;
2225       default:
2226       return -next == ESC_h;
2227       }
2228
2229     case ESC_v:
2230     case ESC_V:
2231     switch(item)
2232       {
2233       case 0x0a:
2234       case 0x0b:
2235       case 0x0c:
2236       case 0x0d:
2237       case 0x85:
2238       case 0x2028:
2239       case 0x2029:
2240       return -next != ESC_v;
2241       default:
2242       return -next == ESC_v;
2243       }
2244
2245     default:
2246     return FALSE;
2247     }
2248
2249   case OP_DIGIT:
2250   return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
2251          next == -ESC_h || next == -ESC_v;
2252
2253   case OP_NOT_DIGIT:
2254   return next == -ESC_d;
2255
2256   case OP_WHITESPACE:
2257   return next == -ESC_S || next == -ESC_d || next == -ESC_w;
2258
2259   case OP_NOT_WHITESPACE:
2260   return next == -ESC_s || next == -ESC_h || next == -ESC_v;
2261
2262   case OP_HSPACE:
2263   return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w;
2264
2265   case OP_NOT_HSPACE:
2266   return next == -ESC_h;
2267
2268   /* Can't have \S in here because VT matches \S (Perl anomaly) */
2269   case OP_VSPACE:
2270   return next == -ESC_V || next == -ESC_d || next == -ESC_w;
2271
2272   case OP_NOT_VSPACE:
2273   return next == -ESC_v;
2274
2275   case OP_WORDCHAR:
2276   return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;
2277
2278   case OP_NOT_WORDCHAR:
2279   return next == -ESC_w || next == -ESC_d;
2280
2281   default:
2282   return FALSE;
2283   }
2284
2285 /* Control does not reach here */
2286 }
2287
2288
2289
2290 /*************************************************
2291 *           Compile one branch                   *
2292 *************************************************/
2293
2294 /* Scan the pattern, compiling it into a vector. If the options are
2295 changed during the branch, the pointer is used to change the external options
2296 bits. This function is used during the pre-compile phase when we are trying
2297 to find out the amount of memory needed, as well as during the real compile
2298 phase. The value of lengthptr distinguishes the two phases.
2299
2300 Arguments:
2301   optionsptr     pointer to the option bits
2302   codeptr        points to the pointer to the current code point
2303   ptrptr         points to the current pattern pointer
2304   errorcodeptr   points to error code variable
2305   firstbyteptr   set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2306   reqbyteptr     set to the last literal character required, else < 0
2307   bcptr          points to current branch chain
2308   cd             contains pointers to tables etc.
2309   lengthptr      NULL during the real compile phase
2310                  points to length accumulator during pre-compile phase
2311
2312 Returns:         TRUE on success
2313                  FALSE, with *errorcodeptr set non-zero on error
2314 */
2315
2316 static BOOL
2317 compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2318   int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2319   compile_data *cd, int *lengthptr)
2320 {
2321 int repeat_type, op_type;
2322 int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */
2323 int bravalue = 0;
2324 int greedy_default, greedy_non_default;
2325 int firstbyte, reqbyte;
2326 int zeroreqbyte, zerofirstbyte;
2327 int req_caseopt, reqvary, tempreqvary;
2328 int options = *optionsptr;
2329 int after_manual_callout = 0;
2330 int length_prevgroup = 0;
2331 register int c;
2332 register uschar *code = *codeptr;
2333 uschar *last_code = code;
2334 uschar *orig_code = code;
2335 uschar *tempcode;
2336 BOOL inescq = FALSE;
2337 BOOL groupsetfirstbyte = FALSE;
2338 const uschar *ptr = *ptrptr;
2339 const uschar *tempptr;
2340 uschar *previous = NULL;
2341 uschar *previous_callout = NULL;
2342 uschar *save_hwm = NULL;
2343 uschar classbits[32];
2344
2345 #ifdef SUPPORT_UTF8
2346 BOOL class_utf8;
2347 BOOL utf8 = (options & PCRE_UTF8) != 0;
2348 uschar *class_utf8data;
2349 uschar utf8_char[6];
2350 #else
2351 BOOL utf8 = FALSE;
2352 uschar *utf8_char = NULL;
2353 #endif
2354
2355 #ifdef DEBUG
2356 if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2357 #endif
2358
2359 /* Set up the default and non-default settings for greediness */
2360
2361 greedy_default = ((options & PCRE_UNGREEDY) != 0);
2362 greedy_non_default = greedy_default ^ 1;
2363
2364 /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2365 matching encountered yet". It gets changed to REQ_NONE if we hit something that
2366 matches a non-fixed char first char; reqbyte just remains unset if we never
2367 find one.
2368
2369 When we hit a repeat whose minimum is zero, we may have to adjust these values
2370 to take the zero repeat into account. This is implemented by setting them to
2371 zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2372 item types that can be repeated set these backoff variables appropriately. */
2373
2374 firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2375
2376 /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2377 according to the current setting of the caseless flag. REQ_CASELESS is a bit
2378 value > 255. It is added into the firstbyte or reqbyte variables to record the
2379 case status of the value. This is used only for ASCII characters. */
2380
2381 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2382
2383 /* Switch on next character until the end of the branch */
2384
2385 for (;; ptr++)
2386   {
2387   BOOL negate_class;
2388   BOOL possessive_quantifier;
2389   BOOL is_quantifier;
2390   BOOL is_recurse;
2391   BOOL reset_bracount;
2392   int class_charcount;
2393   int class_lastchar;
2394   int newoptions;
2395   int recno;
2396   int refsign;
2397   int skipbytes;
2398   int subreqbyte;
2399   int subfirstbyte;
2400   int terminator;
2401   int mclength;
2402   uschar mcbuffer[8];
2403
2404   /* Get next byte in the pattern */
2405
2406   c = *ptr;
2407
2408   /* If we are in the pre-compile phase, accumulate the length used for the
2409   previous cycle of this loop. */
2410
2411   if (lengthptr != NULL)
2412     {
2413 #ifdef DEBUG
2414     if (code > cd->hwm) cd->hwm = code;                 /* High water info */
2415 #endif
2416     if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
2417       {
2418       *errorcodeptr = ERR52;
2419       goto FAILED;
2420       }
2421
2422     /* There is at least one situation where code goes backwards: this is the
2423     case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2424     the class is simply eliminated. However, it is created first, so we have to
2425     allow memory for it. Therefore, don't ever reduce the length at this point.
2426     */
2427
2428     if (code < last_code) code = last_code;
2429
2430     /* Paranoid check for integer overflow */
2431
2432     if (OFLOW_MAX - *lengthptr < code - last_code)
2433       {
2434       *errorcodeptr = ERR20;
2435       goto FAILED;
2436       }
2437
2438     *lengthptr += code - last_code;
2439     DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2440
2441     /* If "previous" is set and it is not at the start of the work space, move
2442     it back to there, in order to avoid filling up the work space. Otherwise,
2443     if "previous" is NULL, reset the current code pointer to the start. */
2444
2445     if (previous != NULL)
2446       {
2447       if (previous > orig_code)
2448         {
2449         memmove(orig_code, previous, code - previous);
2450         code -= previous - orig_code;
2451         previous = orig_code;
2452         }
2453       }
2454     else code = orig_code;
2455
2456     /* Remember where this code item starts so we can pick up the length
2457     next time round. */
2458
2459     last_code = code;
2460     }
2461
2462   /* In the real compile phase, just check the workspace used by the forward
2463   reference list. */
2464
2465   else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
2466     {
2467     *errorcodeptr = ERR52;
2468     goto FAILED;
2469     }
2470
2471   /* If in \Q...\E, check for the end; if not, we have a literal */
2472
2473   if (inescq && c != 0)
2474     {
2475     if (c == '\\' && ptr[1] == 'E')
2476       {
2477       inescq = FALSE;
2478       ptr++;
2479       continue;
2480       }
2481     else
2482       {
2483       if (previous_callout != NULL)
2484         {
2485         if (lengthptr == NULL)  /* Don't attempt in pre-compile phase */
2486           complete_callout(previous_callout, ptr, cd);
2487         previous_callout = NULL;
2488         }
2489       if ((options & PCRE_AUTO_CALLOUT) != 0)
2490         {
2491         previous_callout = code;
2492         code = auto_callout(code, ptr, cd);
2493         }
2494       goto NORMAL_CHAR;
2495       }
2496     }
2497
2498   /* Fill in length of a previous callout, except when the next thing is
2499   a quantifier. */
2500
2501   is_quantifier = c == '*' || c == '+' || c == '?' ||
2502     (c == '{' && is_counted_repeat(ptr+1));
2503
2504   if (!is_quantifier && previous_callout != NULL &&
2505        after_manual_callout-- <= 0)
2506     {
2507     if (lengthptr == NULL)      /* Don't attempt in pre-compile phase */
2508       complete_callout(previous_callout, ptr, cd);
2509     previous_callout = NULL;
2510     }
2511
2512   /* In extended mode, skip white space and comments */
2513
2514   if ((options & PCRE_EXTENDED) != 0)
2515     {
2516     if ((cd->ctypes[c] & ctype_space) != 0) continue;
2517     if (c == '#')
2518       {
2519       while (*(++ptr) != 0)
2520         {
2521         if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2522         }
2523       if (*ptr != 0) continue;
2524
2525       /* Else fall through to handle end of string */
2526       c = 0;
2527       }
2528     }
2529
2530   /* No auto callout for quantifiers. */
2531
2532   if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
2533     {
2534     previous_callout = code;
2535     code = auto_callout(code, ptr, cd);
2536     }
2537
2538   switch(c)
2539     {
2540     /* ===================================================================*/
2541     case 0:                        /* The branch terminates at string end */
2542     case '|':                      /* or | or ) */
2543     case ')':
2544     *firstbyteptr = firstbyte;
2545     *reqbyteptr = reqbyte;
2546     *codeptr = code;
2547     *ptrptr = ptr;
2548     if (lengthptr != NULL)
2549       {
2550       if (OFLOW_MAX - *lengthptr < code - last_code)
2551         {
2552         *errorcodeptr = ERR20;
2553         goto FAILED;
2554         }
2555       *lengthptr += code - last_code;   /* To include callout length */
2556       DPRINTF((">> end branch\n"));
2557       }
2558     return TRUE;
2559
2560
2561     /* ===================================================================*/
2562     /* Handle single-character metacharacters. In multiline mode, ^ disables
2563     the setting of any following char as a first character. */
2564
2565     case '^':
2566     if ((options & PCRE_MULTILINE) != 0)
2567       {
2568       if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2569       }
2570     previous = NULL;
2571     *code++ = OP_CIRC;
2572     break;
2573
2574     case '$':
2575     previous = NULL;
2576     *code++ = OP_DOLL;
2577     break;
2578
2579     /* There can never be a first char if '.' is first, whatever happens about
2580     repeats. The value of reqbyte doesn't change either. */
2581
2582     case '.':
2583     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2584     zerofirstbyte = firstbyte;
2585     zeroreqbyte = reqbyte;
2586     previous = code;
2587     *code++ = OP_ANY;
2588     break;
2589
2590
2591     /* ===================================================================*/
2592     /* Character classes. If the included characters are all < 256, we build a
2593     32-byte bitmap of the permitted characters, except in the special case
2594     where there is only one such character. For negated classes, we build the
2595     map as usual, then invert it at the end. However, we use a different opcode
2596     so that data characters > 255 can be handled correctly.
2597
2598     If the class contains characters outside the 0-255 range, a different
2599     opcode is compiled. It may optionally have a bit map for characters < 256,
2600     but those above are explicitly listed afterwards. A flag byte tells
2601     whether the bitmap is present, and whether this is a negated class or not.
2602     */
2603
2604     case '[':
2605     previous = code;
2606
2607     /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
2608     they are encountered at the top level, so we'll do that too. */
2609
2610     if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2611         check_posix_syntax(ptr, &tempptr, cd))
2612       {
2613       *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;
2614       goto FAILED;
2615       }
2616
2617     /* If the first character is '^', set the negation flag and skip it. Also,
2618     if the first few characters (either before or after ^) are \Q\E or \E we
2619     skip them too. This makes for compatibility with Perl. */
2620
2621     negate_class = FALSE;
2622     for (;;)
2623       {
2624       c = *(++ptr);
2625       if (c == '\\')
2626         {
2627         if (ptr[1] == 'E') ptr++;
2628           else if (strncmp((const char *)ptr+1, "Q\\E", 3) == 0) ptr += 3;
2629             else break;
2630         }
2631       else if (!negate_class && c == '^')
2632         negate_class = TRUE;
2633       else break;
2634       }
2635
2636     /* Keep a count of chars with values < 256 so that we can optimize the case
2637     of just a single character (as long as it's < 256). However, for higher
2638     valued UTF-8 characters, we don't yet do any optimization. */
2639
2640     class_charcount = 0;
2641     class_lastchar = -1;
2642
2643     /* Initialize the 32-char bit map to all zeros. We build the map in a
2644     temporary bit of memory, in case the class contains only 1 character (less
2645     than 256), because in that case the compiled code doesn't use the bit map.
2646     */
2647
2648     memset(classbits, 0, 32 * sizeof(uschar));
2649
2650 #ifdef SUPPORT_UTF8
2651     class_utf8 = FALSE;                       /* No chars >= 256 */
2652     class_utf8data = code + LINK_SIZE + 2;    /* For UTF-8 items */
2653 #endif
2654
2655     /* Process characters until ] is reached. By writing this as a "do" it
2656     means that an initial ] is taken as a data character. At the start of the
2657     loop, c contains the first byte of the character. */
2658
2659     if (c != 0) do
2660       {
2661       const uschar *oldptr;
2662
2663 #ifdef SUPPORT_UTF8
2664       if (utf8 && c > 127)
2665         {                           /* Braces are required because the */
2666         GETCHARLEN(c, ptr, ptr);    /* macro generates multiple statements */
2667         }
2668 #endif
2669
2670       /* Inside \Q...\E everything is literal except \E */
2671
2672       if (inescq)
2673         {
2674         if (c == '\\' && ptr[1] == 'E')     /* If we are at \E */
2675           {
2676           inescq = FALSE;                   /* Reset literal state */
2677           ptr++;                            /* Skip the 'E' */
2678           continue;                         /* Carry on with next */
2679           }
2680         goto CHECK_RANGE;                   /* Could be range if \E follows */
2681         }
2682
2683       /* Handle POSIX class names. Perl allows a negation extension of the
2684       form [:^name:]. A square bracket that doesn't match the syntax is
2685       treated as a literal. We also recognize the POSIX constructions
2686       [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
2687       5.6 and 5.8 do. */
2688
2689       if (c == '[' &&
2690           (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2691           check_posix_syntax(ptr, &tempptr, cd))
2692         {
2693         BOOL local_negate = FALSE;
2694         int posix_class, taboffset, tabopt;
2695         register const uschar *cbits = cd->cbits;
2696         uschar pbits[32];
2697
2698         if (ptr[1] != ':')
2699           {
2700           *errorcodeptr = ERR31;
2701           goto FAILED;
2702           }
2703
2704         ptr += 2;
2705         if (*ptr == '^')
2706           {
2707           local_negate = TRUE;
2708           ptr++;
2709           }
2710
2711         posix_class = check_posix_name(ptr, tempptr - ptr);
2712         if (posix_class < 0)
2713           {
2714           *errorcodeptr = ERR30;
2715           goto FAILED;
2716           }
2717
2718         /* If matching is caseless, upper and lower are converted to
2719         alpha. This relies on the fact that the class table starts with
2720         alpha, lower, upper as the first 3 entries. */
2721
2722         if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
2723           posix_class = 0;
2724
2725         /* We build the bit map for the POSIX class in a chunk of local store
2726         because we may be adding and subtracting from it, and we don't want to
2727         subtract bits that may be in the main map already. At the end we or the
2728         result into the bit map that is being built. */
2729
2730         posix_class *= 3;
2731
2732         /* Copy in the first table (always present) */
2733
2734         memcpy(pbits, cbits + posix_class_maps[posix_class],
2735           32 * sizeof(uschar));
2736
2737         /* If there is a second table, add or remove it as required. */
2738
2739         taboffset = posix_class_maps[posix_class + 1];
2740         tabopt = posix_class_maps[posix_class + 2];
2741
2742         if (taboffset >= 0)
2743           {
2744           if (tabopt >= 0)
2745             for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
2746           else
2747             for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
2748           }
2749
2750         /* Now see if we need to remove any special characters. An option
2751         value of 1 removes vertical space and 2 removes underscore. */
2752
2753         if (tabopt < 0) tabopt = -tabopt;
2754         if (tabopt == 1) pbits[1] &= ~0x3c;
2755           else if (tabopt == 2) pbits[11] &= 0x7f;
2756
2757         /* Add the POSIX table or its complement into the main table that is
2758         being built and we are done. */
2759
2760         if (local_negate)
2761           for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
2762         else
2763           for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
2764
2765         ptr = tempptr + 1;
2766         class_charcount = 10;  /* Set > 1; assumes more than 1 per class */
2767         continue;    /* End of POSIX syntax handling */
2768         }
2769
2770       /* Backslash may introduce a single character, or it may introduce one
2771       of the specials, which just set a flag. The sequence \b is a special
2772       case. Inside a class (and only there) it is treated as backspace.
2773       Elsewhere it marks a word boundary. Other escapes have preset maps ready
2774       to 'or' into the one we are building. We assume they have more than one
2775       character in them, so set class_charcount bigger than one. */
2776
2777       if (c == '\\')
2778         {
2779         c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2780         if (*errorcodeptr != 0) goto FAILED;
2781
2782         if (-c == ESC_b) c = '\b';       /* \b is backspace in a class */
2783         else if (-c == ESC_X) c = 'X';   /* \X is literal X in a class */
2784         else if (-c == ESC_R) c = 'R';   /* \R is literal R in a class */
2785         else if (-c == ESC_Q)            /* Handle start of quoted string */
2786           {
2787           if (ptr[1] == '\\' && ptr[2] == 'E')
2788             {
2789             ptr += 2; /* avoid empty string */
2790             }
2791           else inescq = TRUE;
2792           continue;
2793           }
2794         else if (-c == ESC_E) continue;  /* Ignore orphan \E */
2795
2796         if (c < 0)
2797           {
2798           register const uschar *cbits = cd->cbits;
2799           class_charcount += 2;     /* Greater than 1 is what matters */
2800
2801           /* Save time by not doing this in the pre-compile phase. */
2802
2803           if (lengthptr == NULL) switch (-c)
2804             {
2805             case ESC_d:
2806             for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
2807             continue;
2808
2809             case ESC_D:
2810             for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
2811             continue;
2812
2813             case ESC_w:
2814             for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
2815             continue;
2816
2817             case ESC_W:
2818             for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
2819             continue;
2820
2821             case ESC_s:
2822             for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
2823             classbits[1] &= ~0x08;   /* Perl 5.004 onwards omits VT from \s */
2824             continue;
2825
2826             case ESC_S:
2827             for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
2828             classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */
2829             continue;
2830
2831             case ESC_E: /* Perl ignores an orphan \E */
2832             continue;
2833
2834             default:    /* Not recognized; fall through */
2835             break;      /* Need "default" setting to stop compiler warning. */
2836             }
2837
2838           /* In the pre-compile phase, just do the recognition. */
2839
2840           else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
2841                    c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
2842
2843           /* We need to deal with \H, \h, \V, and \v in both phases because
2844           they use extra memory. */
2845
2846           if (-c == ESC_h)
2847             {
2848             SETBIT(classbits, 0x09); /* HT */
2849             SETBIT(classbits, 0x20); /* SPACE */
2850             SETBIT(classbits, 0xa0); /* NBSP */
2851 #ifdef SUPPORT_UTF8
2852             if (utf8)
2853               {
2854               class_utf8 = TRUE;
2855               *class_utf8data++ = XCL_SINGLE;
2856               class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
2857               *class_utf8data++ = XCL_SINGLE;
2858               class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
2859               *class_utf8data++ = XCL_RANGE;
2860               class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
2861               class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
2862               *class_utf8data++ = XCL_SINGLE;
2863               class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
2864               *class_utf8data++ = XCL_SINGLE;
2865               class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
2866               *class_utf8data++ = XCL_SINGLE;
2867               class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
2868               }
2869 #endif
2870             continue;
2871             }
2872
2873           if (-c == ESC_H)
2874             {
2875             for (c = 0; c < 32; c++)
2876               {
2877               int x = 0xff;
2878               switch (c)
2879                 {
2880                 case 0x09/8: x ^= 1 << (0x09%8); break;
2881                 case 0x20/8: x ^= 1 << (0x20%8); break;
2882                 case 0xa0/8: x ^= 1 << (0xa0%8); break;
2883                 default: break;
2884                 }
2885               classbits[c] |= x;
2886               }
2887
2888 #ifdef SUPPORT_UTF8
2889             if (utf8)
2890               {
2891               class_utf8 = TRUE;
2892               *class_utf8data++ = XCL_RANGE;
2893               class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2894               class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
2895               *class_utf8data++ = XCL_RANGE;
2896               class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
2897               class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
2898               *class_utf8data++ = XCL_RANGE;
2899               class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
2900               class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
2901               *class_utf8data++ = XCL_RANGE;
2902               class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
2903               class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
2904               *class_utf8data++ = XCL_RANGE;
2905               class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
2906               class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
2907               *class_utf8data++ = XCL_RANGE;
2908               class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
2909               class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
2910               *class_utf8data++ = XCL_RANGE;
2911               class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
2912               class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2913               }
2914 #endif
2915             continue;
2916             }
2917
2918           if (-c == ESC_v)
2919             {
2920             SETBIT(classbits, 0x0a); /* LF */
2921             SETBIT(classbits, 0x0b); /* VT */
2922             SETBIT(classbits, 0x0c); /* FF */
2923             SETBIT(classbits, 0x0d); /* CR */
2924             SETBIT(classbits, 0x85); /* NEL */
2925 #ifdef SUPPORT_UTF8
2926             if (utf8)
2927               {
2928               class_utf8 = TRUE;
2929               *class_utf8data++ = XCL_RANGE;
2930               class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
2931               class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
2932               }
2933 #endif
2934             continue;
2935             }
2936
2937           if (-c == ESC_V)
2938             {
2939             for (c = 0; c < 32; c++)
2940               {
2941               int x = 0xff;
2942               switch (c)
2943                 {
2944                 case 0x0a/8: x ^= 1 << (0x0a%8);
2945                              x ^= 1 << (0x0b%8);
2946                              x ^= 1 << (0x0c%8);
2947                              x ^= 1 << (0x0d%8);
2948                              break;
2949                 case 0x85/8: x ^= 1 << (0x85%8); break;
2950                 default: break;
2951                 }
2952               classbits[c] |= x;
2953               }
2954
2955 #ifdef SUPPORT_UTF8
2956             if (utf8)
2957               {
2958               class_utf8 = TRUE;
2959               *class_utf8data++ = XCL_RANGE;
2960               class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2961               class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
2962               *class_utf8data++ = XCL_RANGE;
2963               class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
2964               class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2965               }
2966 #endif
2967             continue;
2968             }
2969
2970           /* We need to deal with \P and \p in both phases. */
2971
2972 #ifdef SUPPORT_UCP
2973           if (-c == ESC_p || -c == ESC_P)
2974             {
2975             BOOL negated;
2976             int pdata;
2977             int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
2978             if (ptype < 0) goto FAILED;
2979             class_utf8 = TRUE;
2980             *class_utf8data++ = ((-c == ESC_p) != negated)?
2981               XCL_PROP : XCL_NOTPROP;
2982             *class_utf8data++ = ptype;
2983             *class_utf8data++ = pdata;
2984             class_charcount -= 2;   /* Not a < 256 character */
2985             continue;
2986             }
2987 #endif
2988           /* Unrecognized escapes are faulted if PCRE is running in its
2989           strict mode. By default, for compatibility with Perl, they are
2990           treated as literals. */
2991
2992           if ((options & PCRE_EXTRA) != 0)
2993             {
2994             *errorcodeptr = ERR7;
2995             goto FAILED;
2996             }
2997
2998           class_charcount -= 2;  /* Undo the default count from above */
2999           c = *ptr;              /* Get the final character and fall through */
3000           }
3001
3002         /* Fall through if we have a single character (c >= 0). This may be
3003         greater than 256 in UTF-8 mode. */
3004
3005         }   /* End of backslash handling */
3006
3007       /* A single character may be followed by '-' to form a range. However,
3008       Perl does not permit ']' to be the end of the range. A '-' character
3009       at the end is treated as a literal. Perl ignores orphaned \E sequences
3010       entirely. The code for handling \Q and \E is messy. */
3011
3012       CHECK_RANGE:
3013       while (ptr[1] == '\\' && ptr[2] == 'E')
3014         {
3015         inescq = FALSE;
3016         ptr += 2;
3017         }
3018
3019       oldptr = ptr;
3020
3021       /* Remember \r or \n */
3022
3023       if (c == '\r' || c == '\n') cd->external_flags |= PCRE_HASCRORLF;
3024
3025       /* Check for range */
3026
3027       if (!inescq && ptr[1] == '-')
3028         {
3029         int d;
3030         ptr += 2;
3031         while (*ptr == '\\' && ptr[1] == 'E') ptr += 2;
3032
3033         /* If we hit \Q (not followed by \E) at this point, go into escaped
3034         mode. */
3035
3036         while (*ptr == '\\' && ptr[1] == 'Q')
3037           {
3038           ptr += 2;
3039           if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; }
3040           inescq = TRUE;
3041           break;
3042           }
3043
3044         if (*ptr == 0 || (!inescq && *ptr == ']'))
3045           {
3046           ptr = oldptr;
3047           goto LONE_SINGLE_CHARACTER;
3048           }
3049
3050 #ifdef SUPPORT_UTF8
3051         if (utf8)
3052           {                           /* Braces are required because the */
3053           GETCHARLEN(d, ptr, ptr);    /* macro generates multiple statements */
3054           }
3055         else
3056 #endif
3057         d = *ptr;  /* Not UTF-8 mode */
3058
3059         /* The second part of a range can be a single-character escape, but
3060         not any of the other escapes. Perl 5.6 treats a hyphen as a literal
3061         in such circumstances. */
3062
3063         if (!inescq && d == '\\')
3064           {
3065           d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3066           if (*errorcodeptr != 0) goto FAILED;
3067
3068           /* \b is backspace; \X is literal X; \R is literal R; any other
3069           special means the '-' was literal */
3070
3071           if (d < 0)
3072             {
3073             if (d == -ESC_b) d = '\b';
3074             else if (d == -ESC_X) d = 'X';
3075             else if (d == -ESC_R) d = 'R'; else
3076               {
3077               ptr = oldptr;
3078               goto LONE_SINGLE_CHARACTER;  /* A few lines below */
3079               }
3080             }
3081           }
3082
3083         /* Check that the two values are in the correct order. Optimize
3084         one-character ranges */
3085
3086         if (d < c)
3087           {
3088           *errorcodeptr = ERR8;
3089           goto FAILED;
3090           }
3091
3092         if (d == c) goto LONE_SINGLE_CHARACTER;  /* A few lines below */
3093
3094         /* Remember \r or \n */
3095
3096         if (d == '\r' || d == '\n') cd->external_flags |= PCRE_HASCRORLF;
3097
3098         /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
3099         matching, we have to use an XCLASS with extra data items. Caseless
3100         matching for characters > 127 is available only if UCP support is
3101         available. */
3102
3103 #ifdef SUPPORT_UTF8
3104         if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
3105           {
3106           class_utf8 = TRUE;
3107
3108           /* With UCP support, we can find the other case equivalents of
3109           the relevant characters. There may be several ranges. Optimize how
3110           they fit with the basic range. */
3111
3112 #ifdef SUPPORT_UCP
3113           if ((options & PCRE_CASELESS) != 0)
3114             {
3115             unsigned int occ, ocd;
3116             unsigned int cc = c;
3117             unsigned int origd = d;
3118             while (get_othercase_range(&cc, origd, &occ, &ocd))
3119               {
3120               if (occ >= (unsigned int)c &&
3121                   ocd <= (unsigned int)d)
3122                 continue;                          /* Skip embedded ranges */
3123
3124               if (occ < (unsigned int)c  &&
3125                   ocd >= (unsigned int)c - 1)      /* Extend the basic range */
3126                 {                                  /* if there is overlap,   */
3127                 c = occ;                           /* noting that if occ < c */
3128                 continue;                          /* we can't have ocd > d  */
3129                 }                                  /* because a subrange is  */
3130               if (ocd > (unsigned int)d &&
3131                   occ <= (unsigned int)d + 1)      /* always shorter than    */
3132                 {                                  /* the basic range.       */
3133                 d = ocd;
3134                 continue;
3135                 }
3136
3137               if (occ == ocd)
3138                 {
3139                 *class_utf8data++ = XCL_SINGLE;
3140                 }
3141               else
3142                 {
3143                 *class_utf8data++ = XCL_RANGE;
3144                 class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
3145                 }
3146               class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
3147               }
3148             }
3149 #endif  /* SUPPORT_UCP */
3150
3151           /* Now record the original range, possibly modified for UCP caseless
3152           overlapping ranges. */
3153
3154           *class_utf8data++ = XCL_RANGE;
3155           class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3156           class_utf8data += _pcre_ord2utf8(d, class_utf8data);
3157
3158           /* With UCP support, we are done. Without UCP support, there is no
3159           caseless matching for UTF-8 characters > 127; we can use the bit map
3160           for the smaller ones. */
3161
3162 #ifdef SUPPORT_UCP
3163           continue;    /* With next character in the class */
3164 #else
3165           if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
3166
3167           /* Adjust upper limit and fall through to set up the map */
3168
3169           d = 127;
3170
3171 #endif  /* SUPPORT_UCP */
3172           }
3173 #endif  /* SUPPORT_UTF8 */
3174
3175         /* We use the bit map for all cases when not in UTF-8 mode; else
3176         ranges that lie entirely within 0-127 when there is UCP support; else
3177         for partial ranges without UCP support. */
3178
3179         class_charcount += d - c + 1;
3180         class_lastchar = d;
3181
3182         /* We can save a bit of time by skipping this in the pre-compile. */
3183
3184         if (lengthptr == NULL) for (; c <= d; c++)
3185           {
3186           classbits[c/8] |= (1 << (c&7));
3187           if ((options & PCRE_CASELESS) != 0)
3188             {
3189             int uc = cd->fcc[c];           /* flip case */
3190             classbits[uc/8] |= (1 << (uc&7));
3191             }
3192           }
3193
3194         continue;   /* Go get the next char in the class */
3195         }
3196
3197       /* Handle a lone single character - we can get here for a normal
3198       non-escape char, or after \ that introduces a single character or for an
3199       apparent range that isn't. */
3200
3201       LONE_SINGLE_CHARACTER:
3202
3203       /* Handle a character that cannot go in the bit map */
3204
3205 #ifdef SUPPORT_UTF8
3206       if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
3207         {
3208         class_utf8 = TRUE;
3209         *class_utf8data++ = XCL_SINGLE;
3210         class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3211
3212 #ifdef SUPPORT_UCP
3213         if ((options & PCRE_CASELESS) != 0)
3214           {
3215           unsigned int othercase;
3216           if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR)
3217             {
3218             *class_utf8data++ = XCL_SINGLE;
3219             class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
3220             }
3221           }
3222 #endif  /* SUPPORT_UCP */
3223
3224         }
3225       else
3226 #endif  /* SUPPORT_UTF8 */
3227
3228       /* Handle a single-byte character */
3229         {
3230         classbits[c/8] |= (1 << (c&7));
3231         if ((options & PCRE_CASELESS) != 0)
3232           {
3233           c = cd->fcc[c];   /* flip case */
3234           classbits[c/8] |= (1 << (c&7));
3235           }
3236         class_charcount++;
3237         class_lastchar = c;
3238         }
3239       }
3240
3241     /* Loop until ']' reached. This "while" is the end of the "do" above. */
3242
3243     while ((c = *(++ptr)) != 0 && (c != ']' || inescq));
3244
3245     if (c == 0)                          /* Missing terminating ']' */
3246       {
3247       *errorcodeptr = ERR6;
3248       goto FAILED;
3249       }
3250
3251
3252 /* This code has been disabled because it would mean that \s counts as
3253 an explicit \r or \n reference, and that's not really what is wanted. Now
3254 we set the flag only if there is a literal "\r" or "\n" in the class. */
3255
3256 #if 0
3257     /* Remember whether \r or \n are in this class */
3258
3259     if (negate_class)
3260       {
3261       if ((classbits[1] & 0x24) != 0x24) cd->external_flags |= PCRE_HASCRORLF;
3262       }
3263     else
3264       {
3265       if ((classbits[1] & 0x24) != 0) cd->external_flags |= PCRE_HASCRORLF;
3266       }
3267 #endif
3268
3269
3270     /* If class_charcount is 1, we saw precisely one character whose value is
3271     less than 256. As long as there were no characters >= 128 and there was no
3272     use of \p or \P, in other words, no use of any XCLASS features, we can
3273     optimize.
3274
3275     In UTF-8 mode, we can optimize the negative case only if there were no
3276     characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR
3277     operate on single-bytes only. This is an historical hangover. Maybe one day
3278     we can tidy these opcodes to handle multi-byte characters.
3279
3280     The optimization throws away the bit map. We turn the item into a
3281     1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
3282     that OP_NOT does not support multibyte characters. In the positive case, it
3283     can cause firstbyte to be set. Otherwise, there can be no first char if
3284     this item is first, whatever repeat count may follow. In the case of
3285     reqbyte, save the previous value for reinstating. */
3286
3287 #ifdef SUPPORT_UTF8
3288     if (class_charcount == 1 && !class_utf8 &&
3289       (!utf8 || !negate_class || class_lastchar < 128))
3290 #else
3291     if (class_charcount == 1)
3292 #endif
3293       {
3294       zeroreqbyte = reqbyte;
3295
3296       /* The OP_NOT opcode works on one-byte characters only. */
3297
3298       if (negate_class)
3299         {
3300         if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3301         zerofirstbyte = firstbyte;
3302         *code++ = OP_NOT;
3303         *code++ = class_lastchar;
3304         break;
3305         }
3306
3307       /* For a single, positive character, get the value into mcbuffer, and
3308       then we can handle this with the normal one-character code. */
3309
3310 #ifdef SUPPORT_UTF8
3311       if (utf8 && class_lastchar > 127)
3312         mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
3313       else
3314 #endif
3315         {
3316         mcbuffer[0] = class_lastchar;
3317         mclength = 1;
3318         }
3319       goto ONE_CHAR;
3320       }       /* End of 1-char optimization */
3321
3322     /* The general case - not the one-char optimization. If this is the first
3323     thing in the branch, there can be no first char setting, whatever the
3324     repeat count. Any reqbyte setting must remain unchanged after any kind of
3325     repeat. */
3326
3327     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3328     zerofirstbyte = firstbyte;
3329     zeroreqbyte = reqbyte;
3330
3331     /* If there are characters with values > 255, we have to compile an
3332     extended class, with its own opcode. If there are no characters < 256,
3333     we can omit the bitmap in the actual compiled code. */
3334
3335 #ifdef SUPPORT_UTF8
3336     if (class_utf8)
3337       {
3338       *class_utf8data++ = XCL_END;    /* Marks the end of extra data */
3339       *code++ = OP_XCLASS;
3340       code += LINK_SIZE;
3341       *code = negate_class? XCL_NOT : 0;
3342
3343       /* If the map is required, move up the extra data to make room for it;
3344       otherwise just move the code pointer to the end of the extra data. */
3345
3346       if (class_charcount > 0)
3347         {
3348         *code++ |= XCL_MAP;
3349         memmove(code + 32, code, class_utf8data - code);
3350         memcpy(code, classbits, 32);
3351         code = class_utf8data + 32;
3352         }
3353       else code = class_utf8data;
3354
3355       /* Now fill in the complete length of the item */
3356
3357       PUT(previous, 1, code - previous);
3358       break;   /* End of class handling */
3359       }
3360 #endif
3361
3362     /* If there are no characters > 255, negate the 32-byte map if necessary,
3363     and copy it into the code vector. If this is the first thing in the branch,
3364     there can be no first char setting, whatever the repeat count. Any reqbyte
3365     setting must remain unchanged after any kind of repeat. */
3366
3367     if (negate_class)
3368       {
3369       *code++ = OP_NCLASS;
3370       if (lengthptr == NULL)    /* Save time in the pre-compile phase */
3371         for (c = 0; c < 32; c++) code[c] = ~classbits[c];
3372       }
3373     else
3374       {
3375       *code++ = OP_CLASS;
3376       memcpy(code, classbits, 32);
3377       }
3378     code += 32;
3379     break;
3380
3381
3382     /* ===================================================================*/
3383     /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
3384     has been tested above. */
3385
3386     case '{':
3387     if (!is_quantifier) goto NORMAL_CHAR;
3388     ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
3389     if (*errorcodeptr != 0) goto FAILED;
3390     goto REPEAT;
3391
3392     case '*':
3393     repeat_min = 0;
3394     repeat_max = -1;
3395     goto REPEAT;
3396
3397     case '+':
3398     repeat_min = 1;
3399     repeat_max = -1;
3400     goto REPEAT;
3401
3402     case '?':
3403     repeat_min = 0;
3404     repeat_max = 1;
3405
3406     REPEAT:
3407     if (previous == NULL)
3408       {
3409       *errorcodeptr = ERR9;
3410       goto FAILED;
3411       }
3412
3413     if (repeat_min == 0)
3414       {
3415       firstbyte = zerofirstbyte;    /* Adjust for zero repeat */
3416       reqbyte = zeroreqbyte;        /* Ditto */
3417       }
3418
3419     /* Remember whether this is a variable length repeat */
3420
3421     reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
3422
3423     op_type = 0;                    /* Default single-char op codes */
3424     possessive_quantifier = FALSE;  /* Default not possessive quantifier */
3425
3426     /* Save start of previous item, in case we have to move it up to make space
3427     for an inserted OP_ONCE for the additional '+' extension. */
3428
3429     tempcode = previous;
3430
3431     /* If the next character is '+', we have a possessive quantifier. This
3432     implies greediness, whatever the setting of the PCRE_UNGREEDY option.
3433     If the next character is '?' this is a minimizing repeat, by default,
3434     but if PCRE_UNGREEDY is set, it works the other way round. We change the
3435     repeat type to the non-default. */
3436
3437     if (ptr[1] == '+')
3438       {
3439       repeat_type = 0;                  /* Force greedy */
3440       possessive_quantifier = TRUE;
3441       ptr++;
3442       }
3443     else if (ptr[1] == '?')
3444       {
3445       repeat_type = greedy_non_default;
3446       ptr++;
3447       }
3448     else repeat_type = greedy_default;
3449
3450     /* If previous was a character match, abolish the item and generate a
3451     repeat item instead. If a char item has a minimum of more than one, ensure
3452     that it is set in reqbyte - it might not be if a sequence such as x{3} is
3453     the first thing in a branch because the x will have gone into firstbyte
3454     instead.  */
3455
3456     if (*previous == OP_CHAR || *previous == OP_CHARNC)
3457       {
3458       /* Deal with UTF-8 characters that take up more than one byte. It's
3459       easier to write this out separately than try to macrify it. Use c to
3460       hold the length of the character in bytes, plus 0x80 to flag that it's a
3461       length rather than a small character. */
3462
3463 #ifdef SUPPORT_UTF8
3464       if (utf8 && (code[-1] & 0x80) != 0)
3465         {
3466         uschar *lastchar = code - 1;
3467         while((*lastchar & 0xc0) == 0x80) lastchar--;
3468         c = code - lastchar;            /* Length of UTF-8 character */
3469         memcpy(utf8_char, lastchar, c); /* Save the char */
3470         c |= 0x80;                      /* Flag c as a length */
3471         }
3472       else
3473 #endif
3474
3475       /* Handle the case of a single byte - either with no UTF8 support, or
3476       with UTF-8 disabled, or for a UTF-8 character < 128. */
3477
3478         {
3479         c = code[-1];
3480         if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
3481         }
3482
3483       /* If the repetition is unlimited, it pays to see if the next thing on
3484       the line is something that cannot possibly match this character. If so,
3485       automatically possessifying this item gains some performance in the case
3486       where the match fails. */
3487
3488       if (!possessive_quantifier &&
3489           repeat_max < 0 &&
3490           check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3491             options, cd))
3492         {
3493         repeat_type = 0;    /* Force greedy */
3494         possessive_quantifier = TRUE;
3495         }
3496
3497       goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */
3498       }
3499
3500     /* If previous was a single negated character ([^a] or similar), we use
3501     one of the special opcodes, replacing it. The code is shared with single-
3502     character repeats by setting op_type to add a suitable offset into
3503     repeat_type. We can also test for auto-possessification. OP_NOT is
3504     currently used only for single-byte chars. */
3505
3506     else if (*previous == OP_NOT)
3507       {
3508       op_type = OP_NOTSTAR - OP_STAR;  /* Use "not" opcodes */
3509       c = previous[1];
3510       if (!possessive_quantifier &&
3511           repeat_max < 0 &&
3512           check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3513         {
3514         repeat_type = 0;    /* Force greedy */
3515         possessive_quantifier = TRUE;
3516         }
3517       goto OUTPUT_SINGLE_REPEAT;
3518       }
3519
3520     /* If previous was a character type match (\d or similar), abolish it and
3521     create a suitable repeat item. The code is shared with single-character
3522     repeats by setting op_type to add a suitable offset into repeat_type. Note
3523     that the Unicode property types will be present only when SUPPORT_UCP is
3524     defined, but we don't wrap the little bits of code here because it just
3525     makes it horribly messy. */
3526
3527     else if (*previous < OP_EODN)
3528       {
3529       uschar *oldcode;
3530       int prop_type, prop_value;
3531       op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */
3532       c = *previous;
3533
3534       if (!possessive_quantifier &&
3535           repeat_max < 0 &&
3536           check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3537         {
3538         repeat_type = 0;    /* Force greedy */
3539         possessive_quantifier = TRUE;
3540         }
3541
3542       OUTPUT_SINGLE_REPEAT:
3543       if (*previous == OP_PROP || *previous == OP_NOTPROP)
3544         {
3545         prop_type = previous[1];
3546         prop_value = previous[2];
3547         }
3548       else prop_type = prop_value = -1;
3549
3550       oldcode = code;
3551       code = previous;                  /* Usually overwrite previous item */
3552
3553       /* If the maximum is zero then the minimum must also be zero; Perl allows
3554       this case, so we do too - by simply omitting the item altogether. */
3555
3556       if (repeat_max == 0) goto END_REPEAT;
3557
3558       /* All real repeats make it impossible to handle partial matching (maybe
3559       one day we will be able to remove this restriction). */
3560
3561       if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL;
3562
3563       /* Combine the op_type with the repeat_type */
3564
3565       repeat_type += op_type;
3566
3567       /* A minimum of zero is handled either as the special case * or ?, or as
3568       an UPTO, with the maximum given. */
3569
3570       if (repeat_min == 0)
3571         {
3572         if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
3573           else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
3574         else
3575           {
3576           *code++ = OP_UPTO + repeat_type;
3577           PUT2INC(code, 0, repeat_max);
3578           }
3579         }
3580
3581       /* A repeat minimum of 1 is optimized into some special cases. If the
3582       maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3583       left in place and, if the maximum is greater than 1, we use OP_UPTO with
3584       one less than the maximum. */
3585
3586       else if (repeat_min == 1)
3587         {
3588         if (repeat_max == -1)
3589           *code++ = OP_PLUS + repeat_type;
3590         else
3591           {
3592           code = oldcode;                 /* leave previous item in place */
3593           if (repeat_max == 1) goto END_REPEAT;
3594           *code++ = OP_UPTO + repeat_type;
3595           PUT2INC(code, 0, repeat_max - 1);
3596           }
3597         }
3598
3599       /* The case {n,n} is just an EXACT, while the general case {n,m} is
3600       handled as an EXACT followed by an UPTO. */
3601
3602       else
3603         {
3604         *code++ = OP_EXACT + op_type;  /* NB EXACT doesn't have repeat_type */
3605         PUT2INC(code, 0, repeat_min);
3606
3607         /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
3608         we have to insert the character for the previous code. For a repeated
3609         Unicode property match, there are two extra bytes that define the
3610         required property. In UTF-8 mode, long characters have their length in
3611         c, with the 0x80 bit as a flag. */
3612
3613         if (repeat_max < 0)
3614           {
3615 #ifdef SUPPORT_UTF8
3616           if (utf8 && c >= 128)
3617             {
3618             memcpy(code, utf8_char, c & 7);
3619             code += c & 7;
3620             }
3621           else
3622 #endif
3623             {
3624             *code++ = c;
3625             if (prop_type >= 0)
3626               {
3627               *code++ = prop_type;
3628               *code++ = prop_value;
3629               }
3630             }
3631           *code++ = OP_STAR + repeat_type;
3632           }
3633
3634         /* Else insert an UPTO if the max is greater than the min, again
3635         preceded by the character, for the previously inserted code. If the
3636         UPTO is just for 1 instance, we can use QUERY instead. */
3637
3638         else if (repeat_max != repeat_min)
3639           {
3640 #ifdef SUPPORT_UTF8
3641           if (utf8 && c >= 128)
3642             {
3643             memcpy(code, utf8_char, c & 7);
3644             code += c & 7;
3645             }
3646           else
3647 #endif
3648           *code++ = c;
3649           if (prop_type >= 0)
3650             {
3651             *code++ = prop_type;
3652             *code++ = prop_value;
3653             }
3654           repeat_max -= repeat_min;
3655
3656           if (repeat_max == 1)
3657             {
3658             *code++ = OP_QUERY + repeat_type;
3659             }
3660           else
3661             {
3662             *code++ = OP_UPTO + repeat_type;
3663             PUT2INC(code, 0, repeat_max);
3664             }
3665           }
3666         }
3667
3668       /* The character or character type itself comes last in all cases. */
3669
3670 #ifdef SUPPORT_UTF8
3671       if (utf8 && c >= 128)
3672         {
3673         memcpy(code, utf8_char, c & 7);
3674         code += c & 7;
3675         }
3676       else
3677 #endif
3678       *code++ = c;
3679
3680       /* For a repeated Unicode property match, there are two extra bytes that
3681       define the required property. */
3682
3683 #ifdef SUPPORT_UCP
3684       if (prop_type >= 0)
3685         {
3686         *code++ = prop_type;
3687         *code++ = prop_value;
3688         }
3689 #endif
3690       }
3691
3692     /* If previous was a character class or a back reference, we put the repeat
3693     stuff after it, but just skip the item if the repeat was {0,0}. */
3694
3695     else if (*previous == OP_CLASS ||
3696              *previous == OP_NCLASS ||
3697 #ifdef SUPPORT_UTF8
3698              *previous == OP_XCLASS ||
3699 #endif
3700              *previous == OP_REF)
3701       {
3702       if (repeat_max == 0)
3703         {
3704         code = previous;
3705         goto END_REPEAT;
3706         }
3707
3708       /* All real repeats make it impossible to handle partial matching (maybe
3709       one day we will be able to remove this restriction). */
3710
3711       if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL;
3712
3713       if (repeat_min == 0 && repeat_max == -1)
3714         *code++ = OP_CRSTAR + repeat_type;
3715       else if (repeat_min == 1 && repeat_max == -1)
3716         *code++ = OP_CRPLUS + repeat_type;
3717       else if (repeat_min == 0 && repeat_max == 1)
3718         *code++ = OP_CRQUERY + repeat_type;
3719       else
3720         {
3721         *code++ = OP_CRRANGE + repeat_type;
3722         PUT2INC(code, 0, repeat_min);
3723         if (repeat_max == -1) repeat_max = 0;  /* 2-byte encoding for max */
3724         PUT2INC(code, 0, repeat_max);
3725         }
3726       }
3727
3728     /* If previous was a bracket group, we may have to replicate it in certain
3729     cases. */
3730
3731     else if (*previous == OP_BRA  || *previous == OP_CBRA ||
3732              *previous == OP_ONCE || *previous == OP_COND)
3733       {
3734       register int i;
3735       int ketoffset = 0;
3736       int len = code - previous;
3737       uschar *bralink = NULL;
3738
3739       /* Repeating a DEFINE group is pointless */
3740
3741       if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
3742         {
3743         *errorcodeptr = ERR55;
3744         goto FAILED;
3745         }
3746
3747       /* If the maximum repeat count is unlimited, find the end of the bracket
3748       by scanning through from the start, and compute the offset back to it
3749       from the current code pointer. There may be an OP_OPT setting following
3750       the final KET, so we can't find the end just by going back from the code
3751       pointer. */
3752
3753       if (repeat_max == -1)
3754         {
3755         register uschar *ket = previous;
3756         do ket += GET(ket, 1); while (*ket != OP_KET);
3757         ketoffset = code - ket;
3758         }
3759
3760       /* The case of a zero minimum is special because of the need to stick
3761       OP_BRAZERO in front of it, and because the group appears once in the
3762       data, whereas in other cases it appears the minimum number of times. For
3763       this reason, it is simplest to treat this case separately, as otherwise
3764       the code gets far too messy. There are several special subcases when the
3765       minimum is zero. */
3766
3767       if (repeat_min == 0)
3768         {
3769         /* If the maximum is also zero, we just omit the group from the output
3770         altogether. */
3771
3772         if (repeat_max == 0)
3773           {
3774           code = previous;
3775           goto END_REPEAT;
3776           }
3777
3778         /* If the maximum is 1 or unlimited, we just have to stick in the
3779         BRAZERO and do no more at this point. However, we do need to adjust
3780         any OP_RECURSE calls inside the group that refer to the group itself or
3781         any internal or forward referenced group, because the offset is from
3782         the start of the whole regex. Temporarily terminate the pattern while
3783         doing this. */
3784
3785         if (repeat_max <= 1)
3786           {
3787           *code = OP_END;
3788           adjust_recurse(previous, 1, utf8, cd, save_hwm);
3789           memmove(previous+1, previous, len);
3790           code++;
3791           *previous++ = OP_BRAZERO + repeat_type;
3792           }
3793
3794         /* If the maximum is greater than 1 and limited, we have to replicate
3795         in a nested fashion, sticking OP_BRAZERO before each set of brackets.
3796         The first one has to be handled carefully because it's the original
3797         copy, which has to be moved up. The remainder can be handled by code
3798         that is common with the non-zero minimum case below. We have to
3799         adjust the value of repeat_max, since one less copy is required. Once
3800         again, we may have to adjust any OP_RECURSE calls inside the group. */
3801
3802         else
3803           {
3804           int offset;
3805           *code = OP_END;
3806           adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
3807           memmove(previous + 2 + LINK_SIZE, previous, len);
3808           code += 2 + LINK_SIZE;
3809           *previous++ = OP_BRAZERO + repeat_type;
3810           *previous++ = OP_BRA;
3811
3812           /* We chain together the bracket offset fields that have to be
3813           filled in later when the ends of the brackets are reached. */
3814
3815           offset = (bralink == NULL)? 0 : previous - bralink;
3816           bralink = previous;
3817           PUTINC(previous, 0, offset);
3818           }
3819
3820         repeat_max--;
3821         }
3822
3823       /* If the minimum is greater than zero, replicate the group as many
3824       times as necessary, and adjust the maximum to the number of subsequent
3825       copies that we need. If we set a first char from the group, and didn't
3826       set a required char, copy the latter from the former. If there are any
3827       forward reference subroutine calls in the group, there will be entries on
3828       the workspace list; replicate these with an appropriate increment. */
3829
3830       else
3831         {
3832         if (repeat_min > 1)
3833           {
3834           /* In the pre-compile phase, we don't actually do the replication. We
3835           just adjust the length as if we had. Do some paranoid checks for
3836           potential integer overflow. */
3837
3838           if (lengthptr != NULL)
3839             {
3840             int delta = (repeat_min - 1)*length_prevgroup;
3841             if ((double)(repeat_min - 1)*(double)length_prevgroup >
3842                                                             (double)INT_MAX ||
3843                 OFLOW_MAX - *lengthptr < delta)
3844               {
3845               *errorcodeptr = ERR20;
3846               goto FAILED;
3847               }
3848             *lengthptr += delta;
3849             }
3850
3851           /* This is compiling for real */
3852
3853           else
3854             {
3855             if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
3856             for (i = 1; i < repeat_min; i++)
3857               {
3858               uschar *hc;
3859               uschar *this_hwm = cd->hwm;
3860               memcpy(code, previous, len);
3861               for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3862                 {
3863                 PUT(cd->hwm, 0, GET(hc, 0) + len);
3864                 cd->hwm += LINK_SIZE;
3865                 }
3866               save_hwm = this_hwm;
3867               code += len;
3868               }
3869             }
3870           }
3871
3872         if (repeat_max > 0) repeat_max -= repeat_min;
3873         }
3874
3875       /* This code is common to both the zero and non-zero minimum cases. If
3876       the maximum is limited, it replicates the group in a nested fashion,
3877       remembering the bracket starts on a stack. In the case of a zero minimum,
3878       the first one was set up above. In all cases the repeat_max now specifies
3879       the number of additional copies needed. Again, we must remember to
3880       replicate entries on the forward reference list. */
3881
3882       if (repeat_max >= 0)
3883         {
3884         /* In the pre-compile phase, we don't actually do the replication. We
3885         just adjust the length as if we had. For each repetition we must add 1
3886         to the length for BRAZERO and for all but the last repetition we must
3887         add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
3888         paranoid checks to avoid integer overflow. */
3889
3890         if (lengthptr != NULL && repeat_max > 0)
3891           {
3892           int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
3893                       2 - 2*LINK_SIZE;   /* Last one doesn't nest */
3894           if ((double)repeat_max *
3895                 (double)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
3896                   > (double)INT_MAX ||
3897               OFLOW_MAX - *lengthptr < delta)
3898             {
3899             *errorcodeptr = ERR20;
3900             goto FAILED;
3901             }
3902           *lengthptr += delta;
3903           }
3904
3905         /* This is compiling for real */
3906
3907         else for (i = repeat_max - 1; i >= 0; i--)
3908           {
3909           uschar *hc;
3910           uschar *this_hwm = cd->hwm;
3911
3912           *code++ = OP_BRAZERO + repeat_type;
3913
3914           /* All but the final copy start a new nesting, maintaining the
3915           chain of brackets outstanding. */
3916
3917           if (i != 0)
3918             {
3919             int offset;
3920             *code++ = OP_BRA;
3921             offset = (bralink == NULL)? 0 : code - bralink;
3922             bralink = code;
3923             PUTINC(code, 0, offset);
3924             }
3925
3926           memcpy(code, previous, len);
3927           for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3928             {
3929             PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
3930             cd->hwm += LINK_SIZE;
3931             }
3932           save_hwm = this_hwm;
3933           code += len;
3934           }
3935
3936         /* Now chain through the pending brackets, and fill in their length
3937         fields (which are holding the chain links pro tem). */
3938
3939         while (bralink != NULL)
3940           {
3941           int oldlinkoffset;
3942           int offset = code - bralink + 1;
3943           uschar *bra = code - offset;
3944           oldlinkoffset = GET(bra, 1);
3945           bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
3946           *code++ = OP_KET;
3947           PUTINC(code, 0, offset);
3948           PUT(bra, 1, offset);
3949           }
3950         }
3951
3952       /* If the maximum is unlimited, set a repeater in the final copy. We
3953       can't just offset backwards from the current code point, because we
3954       don't know if there's been an options resetting after the ket. The
3955       correct offset was computed above.
3956
3957       Then, when we are doing the actual compile phase, check to see whether
3958       this group is a non-atomic one that could match an empty string. If so,
3959       convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
3960       that runtime checking can be done. [This check is also applied to
3961       atomic groups at runtime, but in a different way.] */
3962
3963       else
3964         {
3965         uschar *ketcode = code - ketoffset;
3966         uschar *bracode = ketcode - GET(ketcode, 1);
3967         *ketcode = OP_KETRMAX + repeat_type;
3968         if (lengthptr == NULL && *bracode != OP_ONCE)
3969           {
3970           uschar *scode = bracode;
3971           do
3972             {
3973             if (could_be_empty_branch(scode, ketcode, utf8))
3974               {
3975               *bracode += OP_SBRA - OP_BRA;
3976               break;
3977               }
3978             scode += GET(scode, 1);
3979             }
3980           while (*scode == OP_ALT);
3981           }
3982         }
3983       }
3984
3985     /* Else there's some kind of shambles */
3986
3987     else
3988       {
3989       *errorcodeptr = ERR11;
3990       goto FAILED;
3991       }
3992
3993     /* If the character following a repeat is '+', or if certain optimization
3994     tests above succeeded, possessive_quantifier is TRUE. For some of the
3995     simpler opcodes, there is a special alternative opcode for this. For
3996     anything else, we wrap the entire repeated item inside OP_ONCE brackets.
3997     The '+' notation is just syntactic sugar, taken from Sun's Java package,
3998     but the special opcodes can optimize it a bit. The repeated item starts at
3999     tempcode, not at previous, which might be the first part of a string whose
4000     (former) last char we repeated.
4001
4002     Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
4003     an 'upto' may follow. We skip over an 'exact' item, and then test the
4004     length of what remains before proceeding. */
4005
4006     if (possessive_quantifier)
4007       {
4008       int len;
4009       if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||
4010           *tempcode == OP_NOTEXACT)
4011         tempcode += _pcre_OP_lengths[*tempcode];
4012       len = code - tempcode;
4013       if (len > 0) switch (*tempcode)
4014         {
4015         case OP_STAR:  *tempcode = OP_POSSTAR; break;
4016         case OP_PLUS:  *tempcode = OP_POSPLUS; break;
4017         case OP_QUERY: *tempcode = OP_POSQUERY; break;
4018         case OP_UPTO:  *tempcode = OP_POSUPTO; break;
4019
4020         case OP_TYPESTAR:  *tempcode = OP_TYPEPOSSTAR; break;
4021         case OP_TYPEPLUS:  *tempcode = OP_TYPEPOSPLUS; break;
4022         case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
4023         case OP_TYPEUPTO:  *tempcode = OP_TYPEPOSUPTO; break;
4024
4025         case OP_NOTSTAR:  *tempcode = OP_NOTPOSSTAR; break;
4026         case OP_NOTPLUS:  *tempcode = OP_NOTPOSPLUS; break;
4027         case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
4028         case OP_NOTUPTO:  *tempcode = OP_NOTPOSUPTO; break;
4029
4030         default:
4031         memmove(tempcode + 1+LINK_SIZE, tempcode, len);
4032         code += 1 + LINK_SIZE;
4033         len += 1 + LINK_SIZE;
4034         tempcode[0] = OP_ONCE;
4035         *code++ = OP_KET;
4036         PUTINC(code, 0, len);
4037         PUT(tempcode, 1, len);
4038         break;
4039         }
4040       }
4041
4042     /* In all cases we no longer have a previous item. We also set the
4043     "follows varying string" flag for subsequently encountered reqbytes if
4044     it isn't already set and we have just passed a varying length item. */
4045
4046     END_REPEAT:
4047     previous = NULL;
4048     cd->req_varyopt |= reqvary;
4049     break;
4050
4051
4052     /* ===================================================================*/
4053     /* Start of nested parenthesized sub-expression, or comment or lookahead or
4054     lookbehind or option setting or condition or all the other extended
4055     parenthesis forms.  */
4056
4057     case '(':
4058     newoptions = options;
4059     skipbytes = 0;
4060     bravalue = OP_CBRA;
4061     save_hwm = cd->hwm;
4062     reset_bracount = FALSE;
4063
4064     /* First deal with various "verbs" that can be introduced by '*'. */
4065
4066     if (*(++ptr) == '*' && (cd->ctypes[ptr[1]] & ctype_letter) != 0)
4067       {
4068       int i, namelen;
4069       const char *vn = verbnames;
4070       const uschar *name = ++ptr;
4071       previous = NULL;
4072       while ((cd->ctypes[*++ptr] & ctype_letter) != 0);
4073       if (*ptr == ':')
4074         {
4075         *errorcodeptr = ERR59;   /* Not supported */
4076         goto FAILED;
4077         }
4078       if (*ptr != ')')
4079         {
4080         *errorcodeptr = ERR60;
4081         goto FAILED;
4082         }
4083       namelen = ptr - name;
4084       for (i = 0; i < verbcount; i++)
4085         {
4086         if (namelen == verbs[i].len &&
4087             strncmp((char *)name, vn, namelen) == 0)
4088           {
4089           *code = verbs[i].op;
4090           if (*code++ == OP_ACCEPT) cd->had_accept = TRUE;
4091           break;
4092           }
4093         vn += verbs[i].len + 1;
4094         }
4095       if (i < verbcount) continue;
4096       *errorcodeptr = ERR60;
4097       goto FAILED;
4098       }
4099
4100     /* Deal with the extended parentheses; all are introduced by '?', and the
4101     appearance of any of them means that this is not a capturing group. */
4102
4103     else if (*ptr == '?')
4104       {
4105       int i, set, unset, namelen;
4106       int *optset;
4107       const uschar *name;
4108       uschar *slot;
4109
4110       switch (*(++ptr))
4111         {
4112         case '#':                 /* Comment; skip to ket */
4113         ptr++;
4114         while (*ptr != 0 && *ptr != ')') ptr++;
4115         if (*ptr == 0)
4116           {
4117           *errorcodeptr = ERR18;
4118           goto FAILED;
4119           }
4120         continue;
4121
4122
4123         /* ------------------------------------------------------------ */
4124         case '|':                 /* Reset capture count for each branch */
4125         reset_bracount = TRUE;
4126         /* Fall through */
4127
4128         /* ------------------------------------------------------------ */
4129         case ':':                 /* Non-capturing bracket */
4130         bravalue = OP_BRA;
4131         ptr++;
4132         break;
4133
4134
4135         /* ------------------------------------------------------------ */
4136         case '(':
4137         bravalue = OP_COND;       /* Conditional group */
4138
4139         /* A condition can be an assertion, a number (referring to a numbered
4140         group), a name (referring to a named group), or 'R', referring to
4141         recursion. R<digits> and R&name are also permitted for recursion tests.
4142
4143         There are several syntaxes for testing a named group: (?(name)) is used
4144         by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
4145
4146         There are two unfortunate ambiguities, caused by history. (a) 'R' can
4147         be the recursive thing or the name 'R' (and similarly for 'R' followed
4148         by digits), and (b) a number could be a name that consists of digits.
4149         In both cases, we look for a name first; if not found, we try the other
4150         cases. */
4151
4152         /* For conditions that are assertions, check the syntax, and then exit
4153         the switch. This will take control down to where bracketed groups,
4154         including assertions, are processed. */
4155
4156         if (ptr[1] == '?' && (ptr[2] == '=' || ptr[2] == '!' || ptr[2] == '<'))
4157           break;
4158
4159         /* Most other conditions use OP_CREF (a couple change to OP_RREF
4160         below), and all need to skip 3 bytes at the start of the group. */
4161
4162         code[1+LINK_SIZE] = OP_CREF;
4163         skipbytes = 3;
4164         refsign = -1;
4165
4166         /* Check for a test for recursion in a named group. */
4167
4168         if (ptr[1] == 'R' && ptr[2] == '&')
4169           {
4170           terminator = -1;
4171           ptr += 2;
4172           code[1+LINK_SIZE] = OP_RREF;    /* Change the type of test */
4173           }
4174
4175         /* Check for a test for a named group's having been set, using the Perl
4176         syntax (?(<name>) or (?('name') */
4177
4178         else if (ptr[1] == '<')
4179           {
4180           terminator = '>';
4181           ptr++;
4182           }
4183         else if (ptr[1] == '\'')
4184           {
4185           terminator = '\'';
4186           ptr++;
4187           }
4188         else
4189           {
4190           terminator = 0;
4191           if (ptr[1] == '-' || ptr[1] == '+') refsign = *(++ptr);
4192           }
4193
4194         /* We now expect to read a name; anything else is an error */
4195
4196         if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
4197           {
4198           ptr += 1;  /* To get the right offset */
4199           *errorcodeptr = ERR28;
4200           goto FAILED;
4201           }
4202
4203         /* Read the name, but also get it as a number if it's all digits */
4204
4205         recno = 0;
4206         name = ++ptr;
4207         while ((cd->ctypes[*ptr] & ctype_word) != 0)
4208           {
4209           if (recno >= 0)
4210             recno = ((digitab[*ptr] & ctype_digit) != 0)?
4211               recno * 10 + *ptr - '0' : -1;
4212           ptr++;
4213           }
4214         namelen = ptr - name;
4215
4216         if ((terminator > 0 && *ptr++ != terminator) || *ptr++ != ')')
4217           {
4218           ptr--;      /* Error offset */
4219           *errorcodeptr = ERR26;
4220           goto FAILED;
4221           }
4222
4223         /* Do no further checking in the pre-compile phase. */
4224
4225         if (lengthptr != NULL) break;
4226
4227         /* In the real compile we do the work of looking for the actual
4228         reference. If the string started with "+" or "-" we require the rest to
4229         be digits, in which case recno will be set. */
4230
4231         if (refsign > 0)
4232           {
4233           if (recno <= 0)
4234             {
4235             *errorcodeptr = ERR58;
4236             goto FAILED;
4237             }
4238           if (refsign == '-')
4239             {
4240             recno = cd->bracount - recno + 1;
4241             if (recno <= 0)
4242               {
4243               *errorcodeptr = ERR15;
4244               goto FAILED;
4245               }
4246             }
4247           else recno += cd->bracount;
4248           PUT2(code, 2+LINK_SIZE, recno);
4249           break;
4250           }
4251
4252         /* Otherwise (did not start with "+" or "-"), start by looking for the
4253         name. */
4254
4255         slot = cd->name_table;
4256         for (i = 0; i < cd->names_found; i++)
4257           {
4258           if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
4259           slot += cd->name_entry_size;
4260           }
4261
4262         /* Found a previous named subpattern */
4263
4264         if (i < cd->names_found)
4265           {
4266           recno = GET2(slot, 0);
4267           PUT2(code, 2+LINK_SIZE, recno);
4268           }
4269
4270         /* Search the pattern for a forward reference */
4271
4272         else if ((i = find_parens(ptr, cd->bracount, name, namelen,
4273                         (options & PCRE_EXTENDED) != 0)) > 0)
4274           {
4275           PUT2(code, 2+LINK_SIZE, i);
4276           }
4277
4278         /* If terminator == 0 it means that the name followed directly after
4279         the opening parenthesis [e.g. (?(abc)...] and in this case there are
4280         some further alternatives to try. For the cases where terminator != 0
4281         [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
4282         now checked all the possibilities, so give an error. */
4283
4284         else if (terminator != 0)
4285           {
4286           *errorcodeptr = ERR15;
4287           goto FAILED;
4288           }
4289
4290         /* Check for (?(R) for recursion. Allow digits after R to specify a
4291         specific group number. */
4292
4293         else if (*name == 'R')
4294           {
4295           recno = 0;
4296           for (i = 1; i < namelen; i++)
4297             {
4298             if ((digitab[name[i]] & ctype_digit) == 0)
4299               {
4300               *errorcodeptr = ERR15;
4301               goto FAILED;
4302               }
4303             recno = recno * 10 + name[i] - '0';
4304             }
4305           if (recno == 0) recno = RREF_ANY;
4306           code[1+LINK_SIZE] = OP_RREF;      /* Change test type */
4307           PUT2(code, 2+LINK_SIZE, recno);
4308           }
4309
4310         /* Similarly, check for the (?(DEFINE) "condition", which is always
4311         false. */
4312
4313         else if (namelen == 6 && strncmp((char *)name, "DEFINE", 6) == 0)
4314           {
4315           code[1+LINK_SIZE] = OP_DEF;
4316           skipbytes = 1;
4317           }
4318
4319         /* Check for the "name" actually being a subpattern number. */
4320
4321         else if (recno > 0)
4322           {
4323           PUT2(code, 2+LINK_SIZE, recno);
4324           }
4325
4326         /* Either an unidentified subpattern, or a reference to (?(0) */
4327
4328         else
4329           {
4330           *errorcodeptr = (recno == 0)? ERR35: ERR15;
4331           goto FAILED;
4332           }
4333         break;
4334
4335
4336         /* ------------------------------------------------------------ */
4337         case '=':                 /* Positive lookahead */
4338         bravalue = OP_ASSERT;
4339         ptr++;
4340         break;
4341
4342
4343         /* ------------------------------------------------------------ */
4344         case '!':                 /* Negative lookahead */
4345         ptr++;
4346         if (*ptr == ')')          /* Optimize (?!) */
4347           {
4348           *code++ = OP_FAIL;
4349           previous = NULL;
4350           continue;
4351           }
4352         bravalue = OP_ASSERT_NOT;
4353         break;
4354
4355
4356         /* ------------------------------------------------------------ */
4357         case '<':                 /* Lookbehind or named define */
4358         switch (ptr[1])
4359           {
4360           case '=':               /* Positive lookbehind */
4361           bravalue = OP_ASSERTBACK;
4362           ptr += 2;
4363           break;
4364
4365           case '!':               /* Negative lookbehind */
4366           bravalue = OP_ASSERTBACK_NOT;
4367           ptr += 2;
4368           break;
4369
4370           default:                /* Could be name define, else bad */
4371           if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;
4372           ptr++;                  /* Correct offset for error */
4373           *errorcodeptr = ERR24;
4374           goto FAILED;
4375           }
4376         break;
4377
4378
4379         /* ------------------------------------------------------------ */
4380         case '>':                 /* One-time brackets */
4381         bravalue = OP_ONCE;
4382         ptr++;
4383         break;
4384
4385
4386         /* ------------------------------------------------------------ */
4387         case 'C':                 /* Callout - may be followed by digits; */
4388         previous_callout = code;  /* Save for later completion */
4389         after_manual_callout = 1; /* Skip one item before completing */
4390         *code++ = OP_CALLOUT;
4391           {
4392           int n = 0;
4393           while ((digitab[*(++ptr)] & ctype_digit) != 0)
4394             n = n * 10 + *ptr - '0';
4395           if (*ptr != ')')
4396             {
4397             *errorcodeptr = ERR39;
4398             goto FAILED;
4399             }
4400           if (n > 255)
4401             {
4402             *errorcodeptr = ERR38;
4403             goto FAILED;
4404             }
4405           *code++ = n;
4406           PUT(code, 0, ptr - cd->start_pattern + 1);  /* Pattern offset */
4407           PUT(code, LINK_SIZE, 0);                    /* Default length */
4408           code += 2 * LINK_SIZE;
4409           }
4410         previous = NULL;
4411         continue;
4412
4413
4414         /* ------------------------------------------------------------ */
4415         case 'P':                 /* Python-style named subpattern handling */
4416         if (*(++ptr) == '=' || *ptr == '>')  /* Reference or recursion */
4417           {
4418           is_recurse = *ptr == '>';
4419           terminator = ')';
4420           goto NAMED_REF_OR_RECURSE;
4421           }
4422         else if (*ptr != '<')    /* Test for Python-style definition */
4423           {
4424           *errorcodeptr = ERR41;
4425           goto FAILED;
4426           }
4427         /* Fall through to handle (?P< as (?< is handled */
4428
4429
4430         /* ------------------------------------------------------------ */
4431         DEFINE_NAME:    /* Come here from (?< handling */
4432         case '\'':
4433           {
4434           terminator = (*ptr == '<')? '>' : '\'';
4435           name = ++ptr;
4436
4437           while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4438           namelen = ptr - name;
4439
4440           /* In the pre-compile phase, just do a syntax check. */
4441
4442           if (lengthptr != NULL)
4443             {
4444             if (*ptr != terminator)
4445               {
4446               *errorcodeptr = ERR42;
4447               goto FAILED;
4448               }
4449             if (cd->names_found >= MAX_NAME_COUNT)
4450               {
4451               *errorcodeptr = ERR49;
4452               goto FAILED;
4453               }
4454             if (namelen + 3 > cd->name_entry_size)
4455               {
4456               cd->name_entry_size = namelen + 3;
4457               if (namelen > MAX_NAME_SIZE)
4458                 {
4459                 *errorcodeptr = ERR48;
4460                 goto FAILED;
4461                 }
4462               }
4463             }
4464
4465           /* In the real compile, create the entry in the table */
4466
4467           else
4468             {
4469             slot = cd->name_table;
4470             for (i = 0; i < cd->names_found; i++)
4471               {
4472               int crc = memcmp(name, slot+2, namelen);
4473               if (crc == 0)
4474                 {
4475                 if (slot[2+namelen] == 0)
4476                   {
4477                   if ((options & PCRE_DUPNAMES) == 0)
4478                     {
4479                     *errorcodeptr = ERR43;
4480                     goto FAILED;
4481                     }
4482                   }
4483                 else crc = -1;      /* Current name is substring */
4484                 }
4485               if (crc < 0)
4486                 {
4487                 memmove(slot + cd->name_entry_size, slot,
4488                   (cd->names_found - i) * cd->name_entry_size);
4489                 break;
4490                 }
4491               slot += cd->name_entry_size;
4492               }
4493
4494             PUT2(slot, 0, cd->bracount + 1);
4495             memcpy(slot + 2, name, namelen);
4496             slot[2+namelen] = 0;
4497             }
4498           }
4499
4500         /* In both cases, count the number of names we've encountered. */
4501
4502         ptr++;                    /* Move past > or ' */
4503         cd->names_found++;
4504         goto NUMBERED_GROUP;
4505
4506
4507         /* ------------------------------------------------------------ */
4508         case '&':                 /* Perl recursion/subroutine syntax */
4509         terminator = ')';
4510         is_recurse = TRUE;
4511         /* Fall through */
4512
4513         /* We come here from the Python syntax above that handles both
4514         references (?P=name) and recursion (?P>name), as well as falling
4515         through from the Perl recursion syntax (?&name). */
4516
4517         NAMED_REF_OR_RECURSE:
4518         name = ++ptr;
4519         while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4520         namelen = ptr - name;
4521
4522         /* In the pre-compile phase, do a syntax check and set a dummy
4523         reference number. */
4524
4525         if (lengthptr != NULL)
4526           {
4527           if (*ptr != terminator)
4528             {
4529             *errorcodeptr = ERR42;
4530             goto FAILED;
4531             }
4532           if (namelen > MAX_NAME_SIZE)
4533             {
4534             *errorcodeptr = ERR48;
4535             goto FAILED;
4536             }
4537           recno = 0;
4538           }
4539
4540         /* In the real compile, seek the name in the table */
4541
4542         else
4543           {
4544           slot = cd->name_table;
4545           for (i = 0; i < cd->names_found; i++)
4546             {
4547             if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
4548             slot += cd->name_entry_size;
4549             }
4550
4551           if (i < cd->names_found)         /* Back reference */
4552             {
4553             recno = GET2(slot, 0);
4554             }
4555           else if ((recno =                /* Forward back reference */
4556                     find_parens(ptr, cd->bracount, name, namelen,
4557                       (options & PCRE_EXTENDED) != 0)) <= 0)
4558             {
4559             *errorcodeptr = ERR15;
4560             goto FAILED;
4561             }
4562           }
4563
4564         /* In both phases, we can now go to the code that handles numerical
4565         recursion or backreferences. */
4566
4567         if (is_recurse) goto HANDLE_RECURSION;
4568           else goto HANDLE_REFERENCE;
4569
4570
4571         /* ------------------------------------------------------------ */
4572         case 'R':                 /* Recursion */
4573         ptr++;                    /* Same as (?0)      */
4574         /* Fall through */
4575
4576
4577         /* ------------------------------------------------------------ */
4578         case '-': case '+':
4579         case '0': case '1': case '2': case '3': case '4':   /* Recursion or */
4580         case '5': case '6': case '7': case '8': case '9':   /* subroutine */
4581           {
4582           const uschar *called;
4583
4584           if ((refsign = *ptr) == '+') ptr++;
4585           else if (refsign == '-')
4586             {
4587             if ((digitab[ptr[1]] & ctype_digit) == 0)
4588               goto OTHER_CHAR_AFTER_QUERY;
4589             ptr++;
4590             }
4591
4592           recno = 0;
4593           while((digitab[*ptr] & ctype_digit) != 0)
4594             recno = recno * 10 + *ptr++ - '0';
4595
4596           if (*ptr != ')')
4597             {
4598             *errorcodeptr = ERR29;
4599             goto FAILED;
4600             }
4601
4602           if (refsign == '-')
4603             {
4604             if (recno == 0)
4605               {
4606               *errorcodeptr = ERR58;
4607               goto FAILED;
4608               }
4609             recno = cd->bracount - recno + 1;
4610             if (recno <= 0)
4611               {
4612               *errorcodeptr = ERR15;
4613               goto FAILED;
4614               }
4615             }
4616           else if (refsign == '+')
4617             {
4618             if (recno == 0)
4619               {
4620               *errorcodeptr = ERR58;
4621               goto FAILED;
4622               }
4623             recno += cd->bracount;
4624             }
4625
4626           /* Come here from code above that handles a named recursion */
4627
4628           HANDLE_RECURSION:
4629
4630           previous = code;
4631           called = cd->start_code;
4632
4633           /* When we are actually compiling, find the bracket that is being
4634           referenced. Temporarily end the regex in case it doesn't exist before
4635           this point. If we end up with a forward reference, first check that
4636           the bracket does occur later so we can give the error (and position)
4637           now. Then remember this forward reference in the workspace so it can
4638           be filled in at the end. */
4639
4640           if (lengthptr == NULL)
4641             {
4642             *code = OP_END;
4643             if (recno != 0) called = find_bracket(cd->start_code, utf8, recno);
4644
4645             /* Forward reference */
4646
4647             if (called == NULL)
4648               {
4649               if (find_parens(ptr, cd->bracount, NULL, recno,
4650                    (options & PCRE_EXTENDED) != 0) < 0)
4651                 {
4652                 *errorcodeptr = ERR15;
4653                 goto FAILED;
4654                 }
4655               called = cd->start_code + recno;
4656               PUTINC(cd->hwm, 0, code + 2 + LINK_SIZE - cd->start_code);
4657               }
4658
4659             /* If not a forward reference, and the subpattern is still open,
4660             this is a recursive call. We check to see if this is a left
4661             recursion that could loop for ever, and diagnose that case. */
4662
4663             else if (GET(called, 1) == 0 &&
4664                      could_be_empty(called, code, bcptr, utf8))
4665               {
4666               *errorcodeptr = ERR40;
4667               goto FAILED;
4668               }
4669             }
4670
4671           /* Insert the recursion/subroutine item, automatically wrapped inside
4672           "once" brackets. Set up a "previous group" length so that a
4673           subsequent quantifier will work. */
4674
4675           *code = OP_ONCE;
4676           PUT(code, 1, 2 + 2*LINK_SIZE);
4677           code += 1 + LINK_SIZE;
4678
4679           *code = OP_RECURSE;
4680           PUT(code, 1, called - cd->start_code);
4681           code += 1 + LINK_SIZE;
4682
4683           *code = OP_KET;
4684           PUT(code, 1, 2 + 2*LINK_SIZE);
4685           code += 1 + LINK_SIZE;
4686
4687           length_prevgroup = 3 + 3*LINK_SIZE;
4688           }
4689
4690         /* Can't determine a first byte now */
4691
4692         if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
4693         continue;
4694
4695
4696         /* ------------------------------------------------------------ */
4697         default:              /* Other characters: check option setting */
4698         OTHER_CHAR_AFTER_QUERY:
4699         set = unset = 0;
4700         optset = &set;
4701
4702         while (*ptr != ')' && *ptr != ':')
4703           {
4704           switch (*ptr++)
4705             {
4706             case '-': optset = &unset; break;
4707
4708             case 'J':    /* Record that it changed in the external options */
4709             *optset |= PCRE_DUPNAMES;
4710             cd->external_flags |= PCRE_JCHANGED;
4711             break;
4712
4713             case 'i': *optset |= PCRE_CASELESS; break;
4714             case 'm': *optset |= PCRE_MULTILINE; break;
4715             case 's': *optset |= PCRE_DOTALL; break;
4716             case 'x': *optset |= PCRE_EXTENDED; break;
4717             case 'U': *optset |= PCRE_UNGREEDY; break;
4718             case 'X': *optset |= PCRE_EXTRA; break;
4719
4720             default:  *errorcodeptr = ERR12;
4721                       ptr--;    /* Correct the offset */
4722                       goto FAILED;
4723             }
4724           }
4725
4726         /* Set up the changed option bits, but don't change anything yet. */
4727
4728         newoptions = (options | set) & (~unset);
4729
4730         /* If the options ended with ')' this is not the start of a nested
4731         group with option changes, so the options change at this level. If this
4732         item is right at the start of the pattern, the options can be
4733         abstracted and made external in the pre-compile phase, and ignored in
4734         the compile phase. This can be helpful when matching -- for instance in
4735         caseless checking of required bytes.
4736
4737         If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are
4738         definitely *not* at the start of the pattern because something has been
4739         compiled. In the pre-compile phase, however, the code pointer can have
4740         that value after the start, because it gets reset as code is discarded
4741         during the pre-compile. However, this can happen only at top level - if
4742         we are within parentheses, the starting BRA will still be present. At
4743         any parenthesis level, the length value can be used to test if anything
4744         has been compiled at that level. Thus, a test for both these conditions
4745         is necessary to ensure we correctly detect the start of the pattern in
4746         both phases.
4747
4748         If we are not at the pattern start, compile code to change the ims
4749         options if this setting actually changes any of them. We also pass the
4750         new setting back so that it can be put at the start of any following
4751         branches, and when this group ends (if we are in a group), a resetting
4752         item can be compiled. */
4753
4754         if (*ptr == ')')
4755           {
4756           if (code == cd->start_code + 1 + LINK_SIZE &&
4757                (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE))
4758             {
4759             cd->external_options = newoptions;
4760             options = newoptions;
4761             }
4762          else
4763             {
4764             if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
4765               {
4766               *code++ = OP_OPT;
4767               *code++ = newoptions & PCRE_IMS;
4768               }
4769
4770             /* Change options at this level, and pass them back for use
4771             in subsequent branches. Reset the greedy defaults and the case
4772             value for firstbyte and reqbyte. */
4773
4774             *optionsptr = options = newoptions;
4775             greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
4776             greedy_non_default = greedy_default ^ 1;
4777             req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
4778             }
4779
4780           previous = NULL;       /* This item can't be repeated */
4781           continue;              /* It is complete */
4782           }
4783
4784         /* If the options ended with ':' we are heading into a nested group
4785         with possible change of options. Such groups are non-capturing and are
4786         not assertions of any kind. All we need to do is skip over the ':';
4787         the newoptions value is handled below. */
4788
4789         bravalue = OP_BRA;
4790         ptr++;
4791         }     /* End of switch for character following (? */
4792       }       /* End of (? handling */
4793
4794     /* Opening parenthesis not followed by '?'. If PCRE_NO_AUTO_CAPTURE is set,
4795     all unadorned brackets become non-capturing and behave like (?:...)
4796     brackets. */
4797
4798     else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
4799       {
4800       bravalue = OP_BRA;
4801       }
4802
4803     /* Else we have a capturing group. */
4804
4805     else
4806       {
4807       NUMBERED_GROUP:
4808       cd->bracount += 1;
4809       PUT2(code, 1+LINK_SIZE, cd->bracount);
4810       skipbytes = 2;
4811       }
4812
4813     /* Process nested bracketed regex. Assertions may not be repeated, but
4814     other kinds can be. All their opcodes are >= OP_ONCE. We copy code into a
4815     non-register variable in order to be able to pass its address because some
4816     compilers complain otherwise. Pass in a new setting for the ims options if
4817     they have changed. */
4818
4819     previous = (bravalue >= OP_ONCE)? code : NULL;
4820     *code = bravalue;
4821     tempcode = code;
4822     tempreqvary = cd->req_varyopt;     /* Save value before bracket */
4823     length_prevgroup = 0;              /* Initialize for pre-compile phase */
4824
4825     if (!compile_regex(
4826          newoptions,                   /* The complete new option state */
4827          options & PCRE_IMS,           /* The previous ims option state */
4828          &tempcode,                    /* Where to put code (updated) */
4829          &ptr,                         /* Input pointer (updated) */
4830          errorcodeptr,                 /* Where to put an error message */
4831          (bravalue == OP_ASSERTBACK ||
4832           bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
4833          reset_bracount,               /* True if (?| group */
4834          skipbytes,                    /* Skip over bracket number */
4835          &subfirstbyte,                /* For possible first char */
4836          &subreqbyte,                  /* For possible last char */
4837          bcptr,                        /* Current branch chain */
4838          cd,                           /* Tables block */
4839          (lengthptr == NULL)? NULL :   /* Actual compile phase */
4840            &length_prevgroup           /* Pre-compile phase */
4841          ))
4842       goto FAILED;
4843
4844     /* At the end of compiling, code is still pointing to the start of the
4845     group, while tempcode has been updated to point past the end of the group
4846     and any option resetting that may follow it. The pattern pointer (ptr)
4847     is on the bracket. */
4848
4849     /* If this is a conditional bracket, check that there are no more than
4850     two branches in the group, or just one if it's a DEFINE group. We do this
4851     in the real compile phase, not in the pre-pass, where the whole group may
4852     not be available. */
4853
4854     if (bravalue == OP_COND && lengthptr == NULL)
4855       {
4856       uschar *tc = code;
4857       int condcount = 0;
4858
4859       do {
4860          condcount++;
4861          tc += GET(tc,1);
4862          }
4863       while (*tc != OP_KET);
4864
4865       /* A DEFINE group is never obeyed inline (the "condition" is always
4866       false). It must have only one branch. */
4867
4868       if (code[LINK_SIZE+1] == OP_DEF)
4869         {
4870         if (condcount > 1)
4871           {
4872           *errorcodeptr = ERR54;
4873           goto FAILED;
4874           }
4875         bravalue = OP_DEF;   /* Just a flag to suppress char handling below */
4876         }
4877
4878       /* A "normal" conditional group. If there is just one branch, we must not
4879       make use of its firstbyte or reqbyte, because this is equivalent to an
4880       empty second branch. */
4881
4882       else
4883         {
4884         if (condcount > 2)
4885           {
4886           *errorcodeptr = ERR27;
4887           goto FAILED;
4888           }
4889         if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
4890         }
4891       }
4892
4893     /* Error if hit end of pattern */
4894
4895     if (*ptr != ')')
4896       {
4897       *errorcodeptr = ERR14;
4898       goto FAILED;
4899       }
4900
4901     /* In the pre-compile phase, update the length by the length of the group,
4902     less the brackets at either end. Then reduce the compiled code to just a
4903     set of non-capturing brackets so that it doesn't use much memory if it is
4904     duplicated by a quantifier.*/
4905
4906     if (lengthptr != NULL)
4907       {
4908       if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
4909         {
4910         *errorcodeptr = ERR20;
4911         goto FAILED;
4912         }
4913       *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
4914       *code++ = OP_BRA;
4915       PUTINC(code, 0, 1 + LINK_SIZE);
4916       *code++ = OP_KET;
4917       PUTINC(code, 0, 1 + LINK_SIZE);
4918       break;    /* No need to waste time with special character handling */
4919       }
4920
4921     /* Otherwise update the main code pointer to the end of the group. */
4922
4923     code = tempcode;
4924
4925     /* For a DEFINE group, required and first character settings are not
4926     relevant. */
4927
4928     if (bravalue == OP_DEF) break;
4929
4930     /* Handle updating of the required and first characters for other types of
4931     group. Update for normal brackets of all kinds, and conditions with two
4932     branches (see code above). If the bracket is followed by a quantifier with
4933     zero repeat, we have to back off. Hence the definition of zeroreqbyte and
4934     zerofirstbyte outside the main loop so that they can be accessed for the
4935     back off. */
4936
4937     zeroreqbyte = reqbyte;
4938     zerofirstbyte = firstbyte;
4939     groupsetfirstbyte = FALSE;
4940
4941     if (bravalue >= OP_ONCE)
4942       {
4943       /* If we have not yet set a firstbyte in this branch, take it from the
4944       subpattern, remembering that it was set here so that a repeat of more
4945       than one can replicate it as reqbyte if necessary. If the subpattern has
4946       no firstbyte, set "none" for the whole branch. In both cases, a zero
4947       repeat forces firstbyte to "none". */
4948
4949       if (firstbyte == REQ_UNSET)
4950         {
4951         if (subfirstbyte >= 0)
4952           {
4953           firstbyte = subfirstbyte;
4954           groupsetfirstbyte = TRUE;
4955           }
4956         else firstbyte = REQ_NONE;
4957         zerofirstbyte = REQ_NONE;
4958         }
4959
4960       /* If firstbyte was previously set, convert the subpattern's firstbyte
4961       into reqbyte if there wasn't one, using the vary flag that was in
4962       existence beforehand. */
4963
4964       else if (subfirstbyte >= 0 && subreqbyte < 0)
4965         subreqbyte = subfirstbyte | tempreqvary;
4966
4967       /* If the subpattern set a required byte (or set a first byte that isn't
4968       really the first byte - see above), set it. */
4969
4970       if (subreqbyte >= 0) reqbyte = subreqbyte;
4971       }
4972
4973     /* For a forward assertion, we take the reqbyte, if set. This can be
4974     helpful if the pattern that follows the assertion doesn't set a different
4975     char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
4976     for an assertion, however because it leads to incorrect effect for patterns
4977     such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
4978     of a firstbyte. This is overcome by a scan at the end if there's no
4979     firstbyte, looking for an asserted first char. */
4980
4981     else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
4982     break;     /* End of processing '(' */
4983
4984
4985     /* ===================================================================*/
4986     /* Handle metasequences introduced by \. For ones like \d, the ESC_ values
4987     are arranged to be the negation of the corresponding OP_values. For the
4988     back references, the values are ESC_REF plus the reference number. Only
4989     back references and those types that consume a character may be repeated.
4990     We can test for values between ESC_b and ESC_Z for the latter; this may
4991     have to change if any new ones are ever created. */
4992
4993     case '\\':
4994     tempptr = ptr;
4995     c = check_escape(&ptr, errorcodeptr, cd->bracount, options, FALSE);
4996     if (*errorcodeptr != 0) goto FAILED;
4997
4998     if (c < 0)
4999       {
5000       if (-c == ESC_Q)            /* Handle start of quoted string */
5001         {
5002         if (ptr[1] == '\\' && ptr[2] == 'E') ptr += 2; /* avoid empty string */
5003           else inescq = TRUE;
5004         continue;
5005         }
5006
5007       if (-c == ESC_E) continue;  /* Perl ignores an orphan \E */
5008
5009       /* For metasequences that actually match a character, we disable the
5010       setting of a first character if it hasn't already been set. */
5011
5012       if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
5013         firstbyte = REQ_NONE;
5014
5015       /* Set values to reset to if this is followed by a zero repeat. */
5016
5017       zerofirstbyte = firstbyte;
5018       zeroreqbyte = reqbyte;
5019
5020       /* \k<name> or \k'name' is a back reference by name (Perl syntax).
5021       We also support \k{name} (.NET syntax) */
5022
5023       if (-c == ESC_k && (ptr[1] == '<' || ptr[1] == '\'' || ptr[1] == '{'))
5024         {
5025         is_recurse = FALSE;
5026         terminator = (*(++ptr) == '<')? '>' : (*ptr == '\'')? '\'' : '}';
5027         goto NAMED_REF_OR_RECURSE;
5028         }
5029
5030       /* Back references are handled specially; must disable firstbyte if
5031       not set to cope with cases like (?=(\w+))\1: which would otherwise set
5032       ':' later. */
5033
5034       if (-c >= ESC_REF)
5035         {
5036         recno = -c - ESC_REF;
5037
5038         HANDLE_REFERENCE:    /* Come here from named backref handling */
5039         if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
5040         previous = code;
5041         *code++ = OP_REF;
5042         PUT2INC(code, 0, recno);
5043         cd->backref_map |= (recno < 32)? (1 << recno) : 1;
5044         if (recno > cd->top_backref) cd->top_backref = recno;
5045         }
5046
5047       /* So are Unicode property matches, if supported. */
5048
5049 #ifdef SUPPORT_UCP
5050       else if (-c == ESC_P || -c == ESC_p)
5051         {
5052         BOOL negated;
5053         int pdata;
5054         int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
5055         if (ptype < 0) goto FAILED;
5056         previous = code;
5057         *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
5058         *code++ = ptype;
5059         *code++ = pdata;
5060         }
5061 #else
5062
5063       /* If Unicode properties are not supported, \X, \P, and \p are not
5064       allowed. */
5065
5066       else if (-c == ESC_X || -c == ESC_P || -c == ESC_p)
5067         {
5068         *errorcodeptr = ERR45;
5069         goto FAILED;
5070         }
5071 #endif
5072
5073       /* For the rest (including \X when Unicode properties are supported), we
5074       can obtain the OP value by negating the escape value. */
5075
5076       else
5077         {
5078         previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
5079         *code++ = -c;
5080         }
5081       continue;
5082       }
5083
5084     /* We have a data character whose value is in c. In UTF-8 mode it may have
5085     a value > 127. We set its representation in the length/buffer, and then
5086     handle it as a data character. */
5087
5088 #ifdef SUPPORT_UTF8
5089     if (utf8 && c > 127)
5090       mclength = _pcre_ord2utf8(c, mcbuffer);
5091     else
5092 #endif
5093
5094      {
5095      mcbuffer[0] = c;
5096      mclength = 1;
5097      }
5098     goto ONE_CHAR;
5099
5100
5101     /* ===================================================================*/
5102     /* Handle a literal character. It is guaranteed not to be whitespace or #
5103     when the extended flag is set. If we are in UTF-8 mode, it may be a
5104     multi-byte literal character. */
5105
5106     default:
5107     NORMAL_CHAR:
5108     mclength = 1;
5109     mcbuffer[0] = c;
5110
5111 #ifdef SUPPORT_UTF8
5112     if (utf8 && c >= 0xc0)
5113       {
5114       while ((ptr[1] & 0xc0) == 0x80)
5115         mcbuffer[mclength++] = *(++ptr);
5116       }
5117 #endif
5118
5119     /* At this point we have the character's bytes in mcbuffer, and the length
5120     in mclength. When not in UTF-8 mode, the length is always 1. */
5121
5122     ONE_CHAR:
5123     previous = code;
5124     *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARNC : OP_CHAR;
5125     for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
5126
5127     /* Remember if \r or \n were seen */
5128
5129     if (mcbuffer[0] == '\r' || mcbuffer[0] == '\n')
5130       cd->external_flags |= PCRE_HASCRORLF;
5131
5132     /* Set the first and required bytes appropriately. If no previous first
5133     byte, set it from this character, but revert to none on a zero repeat.
5134     Otherwise, leave the firstbyte value alone, and don't change it on a zero
5135     repeat. */
5136
5137     if (firstbyte == REQ_UNSET)
5138       {
5139       zerofirstbyte = REQ_NONE;
5140       zeroreqbyte = reqbyte;
5141
5142       /* If the character is more than one byte long, we can set firstbyte
5143       only if it is not to be matched caselessly. */
5144
5145       if (mclength == 1 || req_caseopt == 0)
5146         {
5147         firstbyte = mcbuffer[0] | req_caseopt;
5148         if (mclength != 1) reqbyte = code[-1] | cd->req_varyopt;
5149         }
5150       else firstbyte = reqbyte = REQ_NONE;
5151       }
5152
5153     /* firstbyte was previously set; we can set reqbyte only if the length is
5154     1 or the matching is caseful. */
5155
5156     else
5157       {
5158       zerofirstbyte = firstbyte;
5159       zeroreqbyte = reqbyte;
5160       if (mclength == 1 || req_caseopt == 0)
5161         reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
5162       }
5163
5164     break;            /* End of literal character handling */
5165     }
5166   }                   /* end of big loop */
5167
5168
5169 /* Control never reaches here by falling through, only by a goto for all the
5170 error states. Pass back the position in the pattern so that it can be displayed
5171 to the user for diagnosing the error. */
5172
5173 FAILED:
5174 *ptrptr = ptr;
5175 return FALSE;
5176 }
5177
5178
5179
5180
5181 /*************************************************
5182 *     Compile sequence of alternatives           *
5183 *************************************************/
5184
5185 /* On entry, ptr is pointing past the bracket character, but on return it
5186 points to the closing bracket, or vertical bar, or end of string. The code
5187 variable is pointing at the byte into which the BRA operator has been stored.
5188 If the ims options are changed at the start (for a (?ims: group) or during any
5189 branch, we need to insert an OP_OPT item at the start of every following branch
5190 to ensure they get set correctly at run time, and also pass the new options
5191 into every subsequent branch compile.
5192
5193 This function is used during the pre-compile phase when we are trying to find
5194 out the amount of memory needed, as well as during the real compile phase. The
5195 value of lengthptr distinguishes the two phases.
5196
5197 Arguments:
5198   options        option bits, including any changes for this subpattern
5199   oldims         previous settings of ims option bits
5200   codeptr        -> the address of the current code pointer
5201   ptrptr         -> the address of the current pattern pointer
5202   errorcodeptr   -> pointer to error code variable
5203   lookbehind     TRUE if this is a lookbehind assertion
5204   reset_bracount TRUE to reset the count for each branch
5205   skipbytes      skip this many bytes at start (for brackets and OP_COND)
5206   firstbyteptr   place to put the first required character, or a negative number
5207   reqbyteptr     place to put the last required character, or a negative number
5208   bcptr          pointer to the chain of currently open branches
5209   cd             points to the data block with tables pointers etc.
5210   lengthptr      NULL during the real compile phase
5211                  points to length accumulator during pre-compile phase
5212
5213 Returns:         TRUE on success
5214 */
5215
5216 static BOOL
5217 compile_regex(int options, int oldims, uschar **codeptr, const uschar **ptrptr,
5218   int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, int skipbytes,
5219   int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd,
5220   int *lengthptr)
5221 {
5222 const uschar *ptr = *ptrptr;
5223 uschar *code = *codeptr;
5224 uschar *last_branch = code;
5225 uschar *start_bracket = code;
5226 uschar *reverse_count = NULL;
5227 int firstbyte, reqbyte;
5228 int branchfirstbyte, branchreqbyte;
5229 int length;
5230 int orig_bracount;
5231 int max_bracount;
5232 branch_chain bc;
5233
5234 bc.outer = bcptr;
5235 bc.current = code;
5236
5237 firstbyte = reqbyte = REQ_UNSET;
5238
5239 /* Accumulate the length for use in the pre-compile phase. Start with the
5240 length of the BRA and KET and any extra bytes that are required at the
5241 beginning. We accumulate in a local variable to save frequent testing of
5242 lengthptr for NULL. We cannot do this by looking at the value of code at the
5243 start and end of each alternative, because compiled items are discarded during
5244 the pre-compile phase so that the work space is not exceeded. */
5245
5246 length = 2 + 2*LINK_SIZE + skipbytes;
5247
5248 /* WARNING: If the above line is changed for any reason, you must also change
5249 the code that abstracts option settings at the start of the pattern and makes
5250 them global. It tests the value of length for (2 + 2*LINK_SIZE) in the
5251 pre-compile phase to find out whether anything has yet been compiled or not. */
5252
5253 /* Offset is set zero to mark that this bracket is still open */
5254
5255 PUT(code, 1, 0);
5256 code += 1 + LINK_SIZE + skipbytes;
5257
5258 /* Loop for each alternative branch */
5259
5260 orig_bracount = max_bracount = cd->bracount;
5261 for (;;)
5262   {
5263   /* For a (?| group, reset the capturing bracket count so that each branch
5264   uses the same numbers. */
5265
5266   if (reset_bracount) cd->bracount = orig_bracount;
5267
5268   /* Handle a change of ims options at the start of the branch */
5269
5270   if ((options & PCRE_IMS) != oldims)
5271     {
5272     *code++ = OP_OPT;
5273     *code++ = options & PCRE_IMS;
5274     length += 2;
5275     }
5276
5277   /* Set up dummy OP_REVERSE if lookbehind assertion */
5278
5279   if (lookbehind)
5280     {
5281     *code++ = OP_REVERSE;
5282     reverse_count = code;
5283     PUTINC(code, 0, 0);
5284     length += 1 + LINK_SIZE;
5285     }
5286
5287   /* Now compile the branch; in the pre-compile phase its length gets added
5288   into the length. */
5289
5290   if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstbyte,
5291         &branchreqbyte, &bc, cd, (lengthptr == NULL)? NULL : &length))
5292     {
5293     *ptrptr = ptr;
5294     return FALSE;
5295     }
5296
5297   /* Keep the highest bracket count in case (?| was used and some branch
5298   has fewer than the rest. */
5299
5300   if (cd->bracount > max_bracount) max_bracount = cd->bracount;
5301
5302   /* In the real compile phase, there is some post-processing to be done. */
5303
5304   if (lengthptr == NULL)
5305     {
5306     /* If this is the first branch, the firstbyte and reqbyte values for the
5307     branch become the values for the regex. */
5308
5309     if (*last_branch != OP_ALT)
5310       {
5311       firstbyte = branchfirstbyte;
5312       reqbyte = branchreqbyte;
5313       }
5314
5315     /* If this is not the first branch, the first char and reqbyte have to
5316     match the values from all the previous branches, except that if the
5317     previous value for reqbyte didn't have REQ_VARY set, it can still match,
5318     and we set REQ_VARY for the regex. */
5319
5320     else
5321       {
5322       /* If we previously had a firstbyte, but it doesn't match the new branch,
5323       we have to abandon the firstbyte for the regex, but if there was
5324       previously no reqbyte, it takes on the value of the old firstbyte. */
5325
5326       if (firstbyte >= 0 && firstbyte != branchfirstbyte)
5327         {
5328         if (reqbyte < 0) reqbyte = firstbyte;
5329         firstbyte = REQ_NONE;
5330         }
5331
5332       /* If we (now or from before) have no firstbyte, a firstbyte from the
5333       branch becomes a reqbyte if there isn't a branch reqbyte. */
5334
5335       if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
5336           branchreqbyte = branchfirstbyte;
5337
5338       /* Now ensure that the reqbytes match */
5339
5340       if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
5341         reqbyte = REQ_NONE;
5342       else reqbyte |= branchreqbyte;   /* To "or" REQ_VARY */
5343       }
5344
5345     /* If lookbehind, check that this branch matches a fixed-length string, and
5346     put the length into the OP_REVERSE item. Temporarily mark the end of the
5347     branch with OP_END. */
5348
5349     if (lookbehind)
5350       {
5351       int fixed_length;
5352       *code = OP_END;
5353       fixed_length = find_fixedlength(last_branch, options);
5354       DPRINTF(("fixed length = %d\n", fixed_length));
5355       if (fixed_length < 0)
5356         {
5357         *errorcodeptr = (fixed_length == -2)? ERR36 : ERR25;
5358         *ptrptr = ptr;
5359         return FALSE;
5360         }
5361       PUT(reverse_count, 0, fixed_length);
5362       }
5363     }
5364
5365   /* Reached end of expression, either ')' or end of pattern. In the real
5366   compile phase, go back through the alternative branches and reverse the chain
5367   of offsets, with the field in the BRA item now becoming an offset to the
5368   first alternative. If there are no alternatives, it points to the end of the
5369   group. The length in the terminating ket is always the length of the whole
5370   bracketed item. If any of the ims options were changed inside the group,
5371   compile a resetting op-code following, except at the very end of the pattern.
5372   Return leaving the pointer at the terminating char. */
5373
5374   if (*ptr != '|')
5375     {
5376     if (lengthptr == NULL)
5377       {
5378       int branch_length = code - last_branch;
5379       do
5380         {
5381         int prev_length = GET(last_branch, 1);
5382         PUT(last_branch, 1, branch_length);
5383         branch_length = prev_length;
5384         last_branch -= branch_length;
5385         }
5386       while (branch_length > 0);
5387       }
5388
5389     /* Fill in the ket */
5390
5391     *code = OP_KET;
5392     PUT(code, 1, code - start_bracket);
5393     code += 1 + LINK_SIZE;
5394
5395     /* Resetting option if needed */
5396
5397     if ((options & PCRE_IMS) != oldims && *ptr == ')')
5398       {
5399       *code++ = OP_OPT;
5400       *code++ = oldims;
5401       length += 2;
5402       }
5403
5404     /* Retain the highest bracket number, in case resetting was used. */
5405
5406     cd->bracount = max_bracount;
5407
5408     /* Set values to pass back */
5409
5410     *codeptr = code;
5411     *ptrptr = ptr;
5412     *firstbyteptr = firstbyte;
5413     *reqbyteptr = reqbyte;
5414     if (lengthptr != NULL)
5415       {
5416       if (OFLOW_MAX - *lengthptr < length)
5417         {
5418         *errorcodeptr = ERR20;
5419         return FALSE;
5420         }
5421       *lengthptr += length;
5422       }
5423     return TRUE;
5424     }
5425
5426   /* Another branch follows. In the pre-compile phase, we can move the code
5427   pointer back to where it was for the start of the first branch. (That is,
5428   pretend that each branch is the only one.)
5429
5430   In the real compile phase, insert an ALT node. Its length field points back
5431   to the previous branch while the bracket remains open. At the end the chain
5432   is reversed. It's done like this so that the start of the bracket has a
5433   zero offset until it is closed, making it possible to detect recursion. */
5434
5435   if (lengthptr != NULL)
5436     {
5437     code = *codeptr + 1 + LINK_SIZE + skipbytes;
5438     length += 1 + LINK_SIZE;
5439     }
5440   else
5441     {
5442     *code = OP_ALT;
5443     PUT(code, 1, code - last_branch);
5444     bc.current = last_branch = code;
5445     code += 1 + LINK_SIZE;
5446     }
5447
5448   ptr++;
5449   }
5450 /* Control never reaches here */
5451 }
5452
5453
5454
5455
5456 /*************************************************
5457 *          Check for anchored expression         *
5458 *************************************************/
5459
5460 /* Try to find out if this is an anchored regular expression. Consider each
5461 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
5462 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
5463 it's anchored. However, if this is a multiline pattern, then only OP_SOD
5464 counts, since OP_CIRC can match in the middle.
5465
5466 We can also consider a regex to be anchored if OP_SOM starts all its branches.
5467 This is the code for \G, which means "match at start of match position, taking
5468 into account the match offset".
5469
5470 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
5471 because that will try the rest of the pattern at all possible matching points,
5472 so there is no point trying again.... er ....
5473
5474 .... except when the .* appears inside capturing parentheses, and there is a
5475 subsequent back reference to those parentheses. We haven't enough information
5476 to catch that case precisely.
5477
5478 At first, the best we could do was to detect when .* was in capturing brackets
5479 and the highest back reference was greater than or equal to that level.
5480 However, by keeping a bitmap of the first 31 back references, we can catch some
5481 of the more common cases more precisely.
5482
5483 Arguments:
5484   code           points to start of expression (the bracket)
5485   options        points to the options setting
5486   bracket_map    a bitmap of which brackets we are inside while testing; this
5487                   handles up to substring 31; after that we just have to take
5488                   the less precise approach
5489   backref_map    the back reference bitmap
5490
5491 Returns:     TRUE or FALSE
5492 */
5493
5494 static BOOL
5495 is_anchored(register const uschar *code, int *options, unsigned int bracket_map,
5496   unsigned int backref_map)
5497 {
5498 do {
5499    const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
5500      options, PCRE_MULTILINE, FALSE);
5501    register int op = *scode;
5502
5503    /* Non-capturing brackets */
5504
5505    if (op == OP_BRA)
5506      {
5507      if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
5508      }
5509
5510    /* Capturing brackets */
5511
5512    else if (op == OP_CBRA)
5513      {
5514      int n = GET2(scode, 1+LINK_SIZE);
5515      int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
5516      if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
5517      }
5518
5519    /* Other brackets */
5520
5521    else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
5522      {
5523      if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
5524      }
5525
5526    /* .* is not anchored unless DOTALL is set and it isn't in brackets that
5527    are or may be referenced. */
5528
5529    else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
5530              op == OP_TYPEPOSSTAR) &&
5531             (*options & PCRE_DOTALL) != 0)
5532      {
5533      if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
5534      }
5535
5536    /* Check for explicit anchoring */
5537
5538    else if (op != OP_SOD && op != OP_SOM &&
5539            ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
5540      return FALSE;
5541    code += GET(code, 1);
5542    }
5543 while (*code == OP_ALT);   /* Loop for each alternative */
5544 return TRUE;
5545 }
5546
5547
5548
5549 /*************************************************
5550 *         Check for starting with ^ or .*        *
5551 *************************************************/
5552
5553 /* This is called to find out if every branch starts with ^ or .* so that
5554 "first char" processing can be done to speed things up in multiline
5555 matching and for non-DOTALL patterns that start with .* (which must start at
5556 the beginning or after \n). As in the case of is_anchored() (see above), we
5557 have to take account of back references to capturing brackets that contain .*
5558 because in that case we can't make the assumption.
5559
5560 Arguments:
5561   code           points to start of expression (the bracket)
5562   bracket_map    a bitmap of which brackets we are inside while testing; this
5563                   handles up to substring 31; after that we just have to take
5564                   the less precise approach
5565   backref_map    the back reference bitmap
5566
5567 Returns:         TRUE or FALSE
5568 */
5569
5570 static BOOL
5571 is_startline(const uschar *code, unsigned int bracket_map,
5572   unsigned int backref_map)
5573 {
5574 do {
5575    const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
5576      NULL, 0, FALSE);
5577    register int op = *scode;
5578
5579    /* Non-capturing brackets */
5580
5581    if (op == OP_BRA)
5582      {
5583      if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
5584      }
5585
5586    /* Capturing brackets */
5587
5588    else if (op == OP_CBRA)
5589      {
5590      int n = GET2(scode, 1+LINK_SIZE);
5591      int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
5592      if (!is_startline(scode, new_map, backref_map)) return FALSE;
5593      }
5594
5595    /* Other brackets */
5596
5597    else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
5598      { if (!is_startline(scode, bracket_map, backref_map)) return FALSE; }
5599
5600    /* .* means "start at start or after \n" if it isn't in brackets that
5601    may be referenced. */
5602
5603    else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
5604      {
5605      if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
5606      }
5607
5608    /* Check for explicit circumflex */
5609
5610    else if (op != OP_CIRC) return FALSE;
5611
5612    /* Move on to the next alternative */
5613
5614    code += GET(code, 1);
5615    }
5616 while (*code == OP_ALT);  /* Loop for each alternative */
5617 return TRUE;
5618 }
5619
5620
5621
5622 /*************************************************
5623 *       Check for asserted fixed first char      *
5624 *************************************************/
5625
5626 /* During compilation, the "first char" settings from forward assertions are
5627 discarded, because they can cause conflicts with actual literals that follow.
5628 However, if we end up without a first char setting for an unanchored pattern,
5629 it is worth scanning the regex to see if there is an initial asserted first
5630 char. If all branches start with the same asserted char, or with a bracket all
5631 of whose alternatives start with the same asserted char (recurse ad lib), then
5632 we return that char, otherwise -1.
5633
5634 Arguments:
5635   code       points to start of expression (the bracket)
5636   options    pointer to the options (used to check casing changes)
5637   inassert   TRUE if in an assertion
5638
5639 Returns:     -1 or the fixed first char
5640 */
5641
5642 static int
5643 find_firstassertedchar(const uschar *code, int *options, BOOL inassert)
5644 {
5645 register int c = -1;
5646 do {
5647    int d;
5648    const uschar *scode =
5649      first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);
5650    register int op = *scode;
5651
5652    switch(op)
5653      {
5654      default:
5655      return -1;
5656
5657      case OP_BRA:
5658      case OP_CBRA:
5659      case OP_ASSERT:
5660      case OP_ONCE:
5661      case OP_COND:
5662      if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)
5663        return -1;
5664      if (c < 0) c = d; else if (c != d) return -1;
5665      break;
5666
5667      case OP_EXACT:       /* Fall through */
5668      scode += 2;
5669
5670      case OP_CHAR:
5671      case OP_CHARNC:
5672      case OP_PLUS:
5673      case OP_MINPLUS:
5674      case OP_POSPLUS:
5675      if (!inassert) return -1;
5676      if (c < 0)
5677        {
5678        c = scode[1];
5679        if ((*options & PCRE_CASELESS) != 0) c |= REQ_CASELESS;
5680        }
5681      else if (c != scode[1]) return -1;
5682      break;
5683      }
5684
5685    code += GET(code, 1);
5686    }
5687 while (*code == OP_ALT);
5688 return c;
5689 }
5690
5691
5692
5693 /*************************************************
5694 *        Compile a Regular Expression            *
5695 *************************************************/
5696
5697 /* This function takes a string and returns a pointer to a block of store
5698 holding a compiled version of the expression. The original API for this
5699 function had no error code return variable; it is retained for backwards
5700 compatibility. The new function is given a new name.
5701
5702 Arguments:
5703   pattern       the regular expression
5704   options       various option bits
5705   errorcodeptr  pointer to error code variable (pcre_compile2() only)
5706                   can be NULL if you don't want a code value
5707   errorptr      pointer to pointer to error text
5708   erroroffset   ptr offset in pattern where error was detected
5709   tables        pointer to character tables or NULL
5710
5711 Returns:        pointer to compiled data block, or NULL on error,
5712                 with errorptr and erroroffset set
5713 */
5714
5715 PCRE_EXP_DEFN pcre *
5716 pcre_compile(const char *pattern, int options, const char **errorptr,
5717   int *erroroffset, const unsigned char *tables)
5718 {
5719 return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
5720 }
5721
5722
5723 PCRE_EXP_DEFN pcre *
5724 pcre_compile2(const char *pattern, int options, int *errorcodeptr,
5725   const char **errorptr, int *erroroffset, const unsigned char *tables)
5726 {
5727 real_pcre *re;
5728 int length = 1;  /* For final END opcode */
5729 int firstbyte, reqbyte, newline;
5730 int errorcode = 0;
5731 int skipatstart = 0;
5732 #ifdef SUPPORT_UTF8
5733 BOOL utf8;
5734 #endif
5735 size_t size;
5736 uschar *code;
5737 const uschar *codestart;
5738 const uschar *ptr;
5739 compile_data compile_block;
5740 compile_data *cd = &compile_block;
5741
5742 /* This space is used for "compiling" into during the first phase, when we are
5743 computing the amount of memory that is needed. Compiled items are thrown away
5744 as soon as possible, so that a fairly large buffer should be sufficient for
5745 this purpose. The same space is used in the second phase for remembering where
5746 to fill in forward references to subpatterns. */
5747
5748 uschar cworkspace[COMPILE_WORK_SIZE];
5749
5750
5751 /* Set this early so that early errors get offset 0. */
5752
5753 ptr = (const uschar *)pattern;
5754
5755 /* We can't pass back an error message if errorptr is NULL; I guess the best we
5756 can do is just return NULL, but we can set a code value if there is a code
5757 pointer. */
5758
5759 if (errorptr == NULL)
5760   {
5761   if (errorcodeptr != NULL) *errorcodeptr = 99;
5762   return NULL;
5763   }
5764
5765 *errorptr = NULL;
5766 if (errorcodeptr != NULL) *errorcodeptr = ERR0;
5767
5768 /* However, we can give a message for this error */
5769
5770 if (erroroffset == NULL)
5771   {
5772   errorcode = ERR16;
5773   goto PCRE_EARLY_ERROR_RETURN2;
5774   }
5775
5776 *erroroffset = 0;
5777
5778 /* Can't support UTF8 unless PCRE has been compiled to include the code. */
5779
5780 #ifdef SUPPORT_UTF8
5781 utf8 = (options & PCRE_UTF8) != 0;
5782 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
5783      (*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0)
5784   {
5785   errorcode = ERR44;
5786   goto PCRE_EARLY_ERROR_RETURN2;
5787   }
5788 #else
5789 if ((options & PCRE_UTF8) != 0)
5790   {
5791   errorcode = ERR32;
5792   goto PCRE_EARLY_ERROR_RETURN;
5793   }
5794 #endif
5795
5796 if ((options & ~PUBLIC_OPTIONS) != 0)
5797   {
5798   errorcode = ERR17;
5799   goto PCRE_EARLY_ERROR_RETURN;
5800   }
5801
5802 /* Set up pointers to the individual character tables */
5803
5804 if (tables == NULL) tables = _pcre_default_tables;
5805 cd->lcc = tables + lcc_offset;
5806 cd->fcc = tables + fcc_offset;
5807 cd->cbits = tables + cbits_offset;
5808 cd->ctypes = tables + ctypes_offset;
5809
5810 /* Check for global one-time settings at the start of the pattern, and remember
5811 the offset for later. */
5812
5813 while (ptr[skipatstart] == '(' && ptr[skipatstart+1] == '*')
5814   {
5815   int newnl = 0;
5816   int newbsr = 0;
5817
5818   if (strncmp((char *)(ptr+skipatstart+2), "CR)", 3) == 0)
5819     { skipatstart += 5; newnl = PCRE_NEWLINE_CR; }
5820   else if (strncmp((char *)(ptr+skipatstart+2), "LF)", 3)  == 0)
5821     { skipatstart += 5; newnl = PCRE_NEWLINE_LF; }
5822   else if (strncmp((char *)(ptr+skipatstart+2), "CRLF)", 5)  == 0)
5823     { skipatstart += 7; newnl = PCRE_NEWLINE_CR + PCRE_NEWLINE_LF; }
5824   else if (strncmp((char *)(ptr+skipatstart+2), "ANY)", 4) == 0)
5825     { skipatstart += 6; newnl = PCRE_NEWLINE_ANY; }
5826   else if (strncmp((char *)(ptr+skipatstart+2), "ANYCRLF)", 8)  == 0)
5827     { skipatstart += 10; newnl = PCRE_NEWLINE_ANYCRLF; }
5828
5829   else if (strncmp((char *)(ptr+skipatstart+2), "BSR_ANYCRLF)", 12) == 0)
5830     { skipatstart += 14; newbsr = PCRE_BSR_ANYCRLF; }
5831   else if (strncmp((char *)(ptr+skipatstart+2), "BSR_UNICODE)", 12) == 0)
5832     { skipatstart += 14; newbsr = PCRE_BSR_UNICODE; }
5833
5834   if (newnl != 0)
5835     options = (options & ~PCRE_NEWLINE_BITS) | newnl;
5836   else if (newbsr != 0)
5837     options = (options & ~(PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) | newbsr;
5838   else break;
5839   }
5840
5841 /* Check validity of \R options. */
5842
5843 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
5844   {
5845   case 0:
5846   case PCRE_BSR_ANYCRLF:
5847   case PCRE_BSR_UNICODE:
5848   break;
5849   default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
5850   }
5851
5852 /* Handle different types of newline. The three bits give seven cases. The
5853 current code allows for fixed one- or two-byte sequences, plus "any" and
5854 "anycrlf". */
5855
5856 switch (options & PCRE_NEWLINE_BITS)
5857   {
5858   case 0: newline = NEWLINE; break;   /* Build-time default */
5859   case PCRE_NEWLINE_CR: newline = '\r'; break;
5860   case PCRE_NEWLINE_LF: newline = '\n'; break;
5861   case PCRE_NEWLINE_CR+
5862        PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
5863   case PCRE_NEWLINE_ANY: newline = -1; break;
5864   case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
5865   default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
5866   }
5867
5868 if (newline == -2)
5869   {
5870   cd->nltype = NLTYPE_ANYCRLF;
5871   }
5872 else if (newline < 0)
5873   {
5874   cd->nltype = NLTYPE_ANY;
5875   }
5876 else
5877   {
5878   cd->nltype = NLTYPE_FIXED;
5879   if (newline > 255)
5880     {
5881     cd->nllen = 2;
5882     cd->nl[0] = (newline >> 8) & 255;
5883     cd->nl[1] = newline & 255;
5884     }
5885   else
5886     {
5887     cd->nllen = 1;
5888     cd->nl[0] = newline;
5889     }
5890   }
5891
5892 /* Maximum back reference and backref bitmap. The bitmap records up to 31 back
5893 references to help in deciding whether (.*) can be treated as anchored or not.
5894 */
5895
5896 cd->top_backref = 0;
5897 cd->backref_map = 0;
5898
5899 /* Reflect pattern for debugging output */
5900
5901 DPRINTF(("------------------------------------------------------------------\n"));
5902 DPRINTF(("%s\n", pattern));
5903
5904 /* Pretend to compile the pattern while actually just accumulating the length
5905 of memory required. This behaviour is triggered by passing a non-NULL final
5906 argument to compile_regex(). We pass a block of workspace (cworkspace) for it
5907 to compile parts of the pattern into; the compiled code is discarded when it is
5908 no longer needed, so hopefully this workspace will never overflow, though there
5909 is a test for its doing so. */
5910
5911 cd->bracount = 0;
5912 cd->names_found = 0;
5913 cd->name_entry_size = 0;
5914 cd->name_table = NULL;
5915 cd->start_workspace = cworkspace;
5916 cd->start_code = cworkspace;
5917 cd->hwm = cworkspace;
5918 cd->start_pattern = (const uschar *)pattern;
5919 cd->end_pattern = (const uschar *)(pattern + strlen(pattern));
5920 cd->req_varyopt = 0;
5921 cd->external_options = options;
5922 cd->external_flags = 0;
5923
5924 /* Now do the pre-compile. On error, errorcode will be set non-zero, so we
5925 don't need to look at the result of the function here. The initial options have
5926 been put into the cd block so that they can be changed if an option setting is
5927 found within the regex right at the beginning. Bringing initial option settings
5928 outside can help speed up starting point checks. */
5929
5930 ptr += skipatstart;
5931 code = cworkspace;
5932 *code = OP_BRA;
5933 (void)compile_regex(cd->external_options, cd->external_options & PCRE_IMS,
5934   &code, &ptr, &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd,
5935   &length);
5936 if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN;
5937
5938 DPRINTF(("end pre-compile: length=%d workspace=%d\n", length,
5939   cd->hwm - cworkspace));
5940
5941 if (length > MAX_PATTERN_SIZE)
5942   {
5943   errorcode = ERR20;
5944   goto PCRE_EARLY_ERROR_RETURN;
5945   }
5946
5947 /* Compute the size of data block needed and get it, either from malloc or
5948 externally provided function. Integer overflow should no longer be possible
5949 because nowadays we limit the maximum value of cd->names_found and
5950 cd->name_entry_size. */
5951
5952 size = length + sizeof(real_pcre) + cd->names_found * (cd->name_entry_size + 3);
5953 re = (real_pcre *)(pcre_malloc)(size);
5954
5955 if (re == NULL)
5956   {
5957   errorcode = ERR21;
5958   goto PCRE_EARLY_ERROR_RETURN;
5959   }
5960
5961 /* Put in the magic number, and save the sizes, initial options, internal
5962 flags, and character table pointer. NULL is used for the default character
5963 tables. The nullpad field is at the end; it's there to help in the case when a
5964 regex compiled on a system with 4-byte pointers is run on another with 8-byte
5965 pointers. */
5966
5967 re->magic_number = MAGIC_NUMBER;
5968 re->size = size;
5969 re->options = cd->external_options;
5970 re->flags = cd->external_flags;
5971 re->dummy1 = 0;
5972 re->first_byte = 0;
5973 re->req_byte = 0;
5974 re->name_table_offset = sizeof(real_pcre);
5975 re->name_entry_size = cd->name_entry_size;
5976 re->name_count = cd->names_found;
5977 re->ref_count = 0;
5978 re->tables = (tables == _pcre_default_tables)? NULL : tables;
5979 re->nullpad = NULL;
5980
5981 /* The starting points of the name/number translation table and of the code are
5982 passed around in the compile data block. The start/end pattern and initial
5983 options are already set from the pre-compile phase, as is the name_entry_size
5984 field. Reset the bracket count and the names_found field. Also reset the hwm
5985 field; this time it's used for remembering forward references to subpatterns.
5986 */
5987
5988 cd->bracount = 0;
5989 cd->names_found = 0;
5990 cd->name_table = (uschar *)re + re->name_table_offset;
5991 codestart = cd->name_table + re->name_entry_size * re->name_count;
5992 cd->start_code = codestart;
5993 cd->hwm = cworkspace;
5994 cd->req_varyopt = 0;
5995 cd->had_accept = FALSE;
5996
5997 /* Set up a starting, non-extracting bracket, then compile the expression. On
5998 error, errorcode will be set non-zero, so we don't need to look at the result
5999 of the function here. */
6000
6001 ptr = (const uschar *)pattern + skipatstart;
6002 code = (uschar *)codestart;
6003 *code = OP_BRA;
6004 (void)compile_regex(re->options, re->options & PCRE_IMS, &code, &ptr,
6005   &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, NULL);
6006 re->top_bracket = cd->bracount;
6007 re->top_backref = cd->top_backref;
6008 re->flags = cd->external_flags;
6009
6010 if (cd->had_accept) reqbyte = -1;   /* Must disable after (*ACCEPT) */
6011
6012 /* If not reached end of pattern on success, there's an excess bracket. */
6013
6014 if (errorcode == 0 && *ptr != 0) errorcode = ERR22;
6015
6016 /* Fill in the terminating state and check for disastrous overflow, but
6017 if debugging, leave the test till after things are printed out. */
6018
6019 *code++ = OP_END;
6020
6021 #ifndef DEBUG
6022 if (code - codestart > length) errorcode = ERR23;
6023 #endif
6024
6025 /* Fill in any forward references that are required. */
6026
6027 while (errorcode == 0 && cd->hwm > cworkspace)
6028   {
6029   int offset, recno;
6030   const uschar *groupptr;
6031   cd->hwm -= LINK_SIZE;
6032   offset = GET(cd->hwm, 0);
6033   recno = GET(codestart, offset);
6034   groupptr = find_bracket(codestart, (re->options & PCRE_UTF8) != 0, recno);
6035   if (groupptr == NULL) errorcode = ERR53;
6036     else PUT(((uschar *)codestart), offset, groupptr - codestart);
6037   }
6038
6039 /* Give an error if there's back reference to a non-existent capturing
6040 subpattern. */
6041
6042 if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15;
6043
6044 /* Failed to compile, or error while post-processing */
6045
6046 if (errorcode != 0)
6047   {
6048   (pcre_free)(re);
6049   PCRE_EARLY_ERROR_RETURN:
6050   *erroroffset = ptr - (const uschar *)pattern;
6051   PCRE_EARLY_ERROR_RETURN2:
6052   *errorptr = find_error_text(errorcode);
6053   if (errorcodeptr != NULL) *errorcodeptr = errorcode;
6054   return NULL;
6055   }
6056
6057 /* If the anchored option was not passed, set the flag if we can determine that
6058 the pattern is anchored by virtue of ^ characters or \A or anything else (such
6059 as starting with .* when DOTALL is set).
6060
6061 Otherwise, if we know what the first byte has to be, save it, because that
6062 speeds up unanchored matches no end. If not, see if we can set the
6063 PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
6064 start with ^. and also when all branches start with .* for non-DOTALL matches.
6065 */
6066
6067 if ((re->options & PCRE_ANCHORED) == 0)
6068   {
6069   int temp_options = re->options;   /* May get changed during these scans */
6070   if (is_anchored(codestart, &temp_options, 0, cd->backref_map))
6071     re->options |= PCRE_ANCHORED;
6072   else
6073     {
6074     if (firstbyte < 0)
6075       firstbyte = find_firstassertedchar(codestart, &temp_options, FALSE);
6076     if (firstbyte >= 0)   /* Remove caseless flag for non-caseable chars */
6077       {
6078       int ch = firstbyte & 255;
6079       re->first_byte = ((firstbyte & REQ_CASELESS) != 0 &&
6080          cd->fcc[ch] == ch)? ch : firstbyte;
6081       re->flags |= PCRE_FIRSTSET;
6082       }
6083     else if (is_startline(codestart, 0, cd->backref_map))
6084       re->flags |= PCRE_STARTLINE;
6085     }
6086   }
6087
6088 /* For an anchored pattern, we use the "required byte" only if it follows a
6089 variable length item in the regex. Remove the caseless flag for non-caseable
6090 bytes. */
6091
6092 if (reqbyte >= 0 &&
6093      ((re->options & PCRE_ANCHORED) == 0 || (reqbyte & REQ_VARY) != 0))
6094   {
6095   int ch = reqbyte & 255;
6096   re->req_byte = ((reqbyte & REQ_CASELESS) != 0 &&
6097     cd->fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte;
6098   re->flags |= PCRE_REQCHSET;
6099   }
6100
6101 /* Print out the compiled data if debugging is enabled. This is never the
6102 case when building a production library. */
6103
6104 #ifdef DEBUG
6105
6106 printf("Length = %d top_bracket = %d top_backref = %d\n",
6107   length, re->top_bracket, re->top_backref);
6108
6109 printf("Options=%08x\n", re->options);
6110
6111 if ((re->flags & PCRE_FIRSTSET) != 0)
6112   {
6113   int ch = re->first_byte & 255;
6114   const char *caseless = ((re->first_byte & REQ_CASELESS) == 0)?
6115     "" : " (caseless)";
6116   if (isprint(ch)) printf("First char = %c%s\n", ch, caseless);
6117     else printf("First char = \\x%02x%s\n", ch, caseless);
6118   }
6119
6120 if ((re->flags & PCRE_REQCHSET) != 0)
6121   {
6122   int ch = re->req_byte & 255;
6123   const char *caseless = ((re->req_byte & REQ_CASELESS) == 0)?
6124     "" : " (caseless)";
6125   if (isprint(ch)) printf("Req char = %c%s\n", ch, caseless);
6126     else printf("Req char = \\x%02x%s\n", ch, caseless);
6127   }
6128
6129 pcre_printint(re, stdout, TRUE);
6130
6131 /* This check is done here in the debugging case so that the code that
6132 was compiled can be seen. */
6133
6134 if (code - codestart > length)
6135   {
6136   (pcre_free)(re);
6137   *errorptr = find_error_text(ERR23);
6138   *erroroffset = ptr - (uschar *)pattern;
6139   if (errorcodeptr != NULL) *errorcodeptr = ERR23;
6140   return NULL;
6141   }
6142 #endif   /* DEBUG */
6143
6144 return (pcre *)re;
6145 }
6146
6147 /* End of pcre_compile.c */