1 /* $Cambridge: exim/src/src/pcre/pcre_compile.c,v 1.2 2005/08/08 10:22:14 ph10 Exp $ */
3 /*************************************************
4 * Perl-Compatible Regular Expressions *
5 *************************************************/
7 /* PCRE is a library of functions to support regular expressions whose syntax
8 and semantics are as close as possible to those of the Perl 5 language.
10 Written by Philip Hazel
11 Copyright (c) 1997-2005 University of Cambridge
13 -----------------------------------------------------------------------------
14 Redistribution and use in source and binary forms, with or without
15 modification, are permitted provided that the following conditions are met:
17 * Redistributions of source code must retain the above copyright notice,
18 this list of conditions and the following disclaimer.
20 * Redistributions in binary form must reproduce the above copyright
21 notice, this list of conditions and the following disclaimer in the
22 documentation and/or other materials provided with the distribution.
24 * Neither the name of the University of Cambridge nor the names of its
25 contributors may be used to endorse or promote products derived from
26 this software without specific prior written permission.
28 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
29 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
30 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
31 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
32 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
33 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
34 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
35 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
36 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
37 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38 POSSIBILITY OF SUCH DAMAGE.
39 -----------------------------------------------------------------------------
43 /* This module contains the external function pcre_compile(), along with
44 supporting internal functions that are not used by other modules. */
47 #include "pcre_internal.h"
50 /*************************************************
51 * Code parameters and static tables *
52 *************************************************/
54 /* Maximum number of items on the nested bracket stacks at compile time. This
55 applies to the nesting of all kinds of parentheses. It does not limit
56 un-nested, non-capturing parentheses. This number can be made bigger if
57 necessary - it is used to dimension one int and one unsigned char vector at
58 compile time. */
60 #define BRASTACK_SIZE 200
63 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
64 are simple data values; negative values are for special things like \d and so
65 on. Zero means further processing is needed (for things like \x), or the escape
66 is invalid. */
68 #if !EBCDIC /* This is the "normal" table for ASCII systems */
69 static const short int escapes[] = {
70 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
71 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
72 '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */
73 0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
74 -ESC_P, -ESC_Q, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
75 -ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
76 '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */
77 0, 0, 0, 0, 0, 0, ESC_n, 0, /* h - o */
78 -ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, 0, -ESC_w, /* p - w */
79 0, 0, -ESC_z /* x - z */
82 #else /* This is the "abnormal" table for EBCDIC systems */
83 static const short int escapes[] = {
84 /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
85 /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
86 /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
87 /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
88 /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
89 /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
90 /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
91 /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
92 /* 88 */ 0, 0, 0, '{', 0, 0, 0, 0,
93 /* 90 */ 0, 0, 0, 'l', 0, ESC_n, 0, -ESC_p,
94 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
95 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0, 0, -ESC_w, 0,
96 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
97 /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
98 /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
99 /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
100 /* C8 */ 0, 0, 0, 0, 0, 0, 0, 0,
101 /* D0 */ '}', 0, 0, 0, 0, 0, 0, -ESC_P,
102 /* D8 */-ESC_Q, 0, 0, 0, 0, 0, 0, 0,
103 /* E0 */ '\\', 0, -ESC_S, 0, 0, 0, -ESC_W, -ESC_X,
104 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
105 /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
106 /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
111 /* Tables of names of POSIX character classes and their lengths. The list is
112 terminated by a zero length entry. The first three must be alpha, lower, upper,
113 as this is assumed for handling case independence. */
/* Names of the POSIX character classes, listed in the same order as the rows
of posix_class_maps. This array carries no sentinel string of its own; the end
of the list is marked by the zero entry in posix_name_lengths. */
115 static const char *const posix_names[] = {
116 "alpha", "lower", "upper",
117 "alnum", "ascii", "blank", "cntrl", "digit", "graph",
118 "print", "punct", "space", "word", "xdigit" };
/* Length of each string in posix_names, in the same order ("word" is 4,
"xdigit" is 6, every other name is 5). The final 0 is the end-of-list marker. */
120 static const uschar posix_name_lengths[] = {
121 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
123 /* Table of class bit maps for each POSIX class; up to three may be combined
124 to form the class. The table for [:blank:] is dynamically modified to remove
125 the vertical space characters. */
127 static const int posix_class_maps[] = {
128 cbit_lower, cbit_upper, -1, /* alpha */
129 cbit_lower, -1, -1, /* lower */
130 cbit_upper, -1, -1, /* upper */
131 cbit_digit, cbit_lower, cbit_upper, /* alnum */
132 cbit_print, cbit_cntrl, -1, /* ascii */
133 cbit_space, -1, -1, /* blank - a GNU extension */
134 cbit_cntrl, -1, -1, /* cntrl */
135 cbit_digit, -1, -1, /* digit */
136 cbit_graph, -1, -1, /* graph */
137 cbit_print, -1, -1, /* print */
138 cbit_punct, -1, -1, /* punct */
139 cbit_space, -1, -1, /* space */
140 cbit_word, -1, -1, /* word - a Perl extension */
141 cbit_xdigit,-1, -1 /* xdigit */
145 /* The texts of compile-time error messages. These are "char *" because they
146 are passed to the outside world. */
148 static const char *error_texts[] = {
150 "\\ at end of pattern",
151 "\\c at end of pattern",
152 "unrecognized character follows \\",
153 "numbers out of order in {} quantifier",
155 "number too big in {} quantifier",
156 "missing terminating ] for character class",
157 "invalid escape sequence in character class",
158 "range out of order in character class",
161 "operand of unlimited repeat could match the empty string",
162 "internal error: unexpected repeat",
163 "unrecognized character after (?",
164 "POSIX named classes are supported only within a class",
167 "reference to non-existent subpattern",
168 "erroffset passed as NULL",
169 "unknown option bit(s) set",
170 "missing ) after comment",
171 "parentheses nested too deeply",
173 "regular expression too large",
174 "failed to get memory",
175 "unmatched parentheses",
176 "internal error: code overflow",
177 "unrecognized character after (?<",
179 "lookbehind assertion is not fixed length",
180 "malformed number after (?(",
181 "conditional group contains more than two branches",
182 "assertion expected after (?(",
183 "(?R or (?digits must be followed by )",
185 "unknown POSIX class name",
186 "POSIX collating elements are not supported",
187 "this version of PCRE is not compiled with PCRE_UTF8 support",
189 "character value in \\x{...} sequence is too large",
191 "invalid condition (?(0)",
192 "\\C not allowed in lookbehind assertion",
193 "PCRE does not support \\L, \\l, \\N, \\U, or \\u",
194 "number after (?C is > 255",
195 "closing ) for (?C expected",
197 "recursive call could loop indefinitely",
198 "unrecognized character after (?P",
199 "syntax error after (?P",
200 "two named groups have the same name",
201 "invalid UTF-8 string",
203 "support for \\P, \\p, and \\X has not been compiled",
204 "malformed \\P or \\p sequence",
205 "unknown property name after \\P or \\p"
209 /* Table to identify digits and hex digits. This is used when compiling
210 patterns. Note that the tables in chartables are dependent on the locale, and
211 may mark arbitrary characters as digits - but the PCRE compiling code expects
212 to handle only 0-9, a-f, and A-F as (hex) digits when compiling. That is why we have
213 a private table here. It costs 256 bytes, but it is a lot faster than doing
214 character value tests (at least in some simple cases I timed), and in some
215 applications one wants PCRE to compile efficiently as well as match
218 For convenience, we use the same bit definitions as in chartables:
220 0x04 decimal digit
221 0x08 hexadecimal digit
223 Then we can use ctype_digit and ctype_xdigit in the code. */
225 #if !EBCDIC /* This is the "normal" case, for ASCII systems */
226 static const unsigned char digitab[] =
228 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
229 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
230 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
231 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
232 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */
233 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */
234 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */
235 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */
236 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */
237 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */
238 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */
239 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */
240 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */
241 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */
242 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */
243 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */
244 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
245 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
246 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
247 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
248 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
249 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
250 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
251 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
252 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
253 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
254 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
255 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
256 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
257 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
258 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
259 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
261 #else /* This is the "abnormal" case, for EBCDIC systems */
262 static const unsigned char digitab[] =
264 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */
265 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
266 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */
267 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
268 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */
269 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
270 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */
271 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
272 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */
273 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */
274 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */
275 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- ¬ */
276 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */
277 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */
278 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
279 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
280 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 */
281 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
282 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */
283 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
284 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */
285 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
286 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */
287 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
288 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 */
289 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
290 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */
291 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
292 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */
293 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
294 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */
295 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
/* EBCDIC-only partial copy of the character-type table, indexed by character
code (256 entries, 8 per row). check_escape() treats a character as alphameric
when its entry has any of the bits in 0x0E set.
NOTE(review): the bit values appear to follow the chartables ctype definitions
(0x08 = hexadecimal digit, as in digitab) - confirm against pcre_internal.h. */
297 static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
298 0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */
299 0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */
300 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */
301 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
302 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 32- 39 */
303 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
304 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 */
305 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
306 0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 */
307 0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */
308 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */
309 0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- ¬ */
310 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */
311 0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */
312 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
313 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
314 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g */
315 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
316 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p */
317 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
318 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x */
319 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
320 0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 */
321 0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
322 0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* { - G */
323 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
324 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* } - P */
325 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
326 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* \ - X */
327 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
328 0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */
329 0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
333 /* Definition to allow mutual recursion */
336 compile_regex(int, int, int *, uschar **, const uschar **, int *, BOOL, int,
337 int *, int *, branch_chain *, compile_data *);
341 /*************************************************
343 *************************************************/
345 /* This function is called when a \ has been encountered. It either returns a
346 positive value for a simple escape such as \n, or a negative value which
347 encodes one of the more complicated things such as \d. When UTF-8 is enabled,
348 a positive value greater than 255 may be returned. On entry, ptr is pointing at
349 the \. On exit, it is on the final character of the escape sequence.
352 ptrptr points to the pattern position pointer
353 errorcodeptr points to the errorcode variable
354 bracount number of previous extracting brackets
355 options the options bits
356 isclass TRUE if inside a character class
358 Returns: zero or positive => a data character
359 negative => a special escape sequence
360 on error, errorcodeptr is set
364 check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
365 int options, BOOL isclass)
367 const uschar *ptr = *ptrptr;
370 /* If backslash is at the end of the pattern, it's an error. */
373 if (c == 0) *errorcodeptr = ERR1;
375 /* Non-alphamerics are literals. For digits or letters, do an initial lookup in
376 a table. A non-zero result is something that can be returned immediately.
377 Otherwise further processing may be required. */
379 #if !EBCDIC /* ASCII coding */
380 else if (c < '0' || c > 'z') {} /* Not alphameric */
381 else if ((i = escapes[c - '0']) != 0) c = i;
383 #else /* EBCDIC coding */
384 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphameric */
385 else if ((i = escapes[c - 0x48]) != 0) c = i;
388 /* Escapes that need further processing, or are illegal. */
392 const uschar *oldptr;
395 /* A number of Perl escapes are not handled by PCRE. We give an explicit
403 *errorcodeptr = ERR37;
406 /* The handling of escape sequences consisting of a string of digits
407 starting with one that is not zero is not straightforward. By experiment,
408 the way Perl works seems to be as follows:
410 Outside a character class, the digits are read as a decimal number. If the
411 number is less than 10, or if there are that many previous extracting
412 left brackets, then it is a back reference. Otherwise, up to three octal
413 digits are read to form an escaped byte. Thus \123 is likely to be octal
414 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
415 value is greater than 377, the least significant 8 bits are taken. Inside a
416 character class, \ followed by a digit is always an octal number. */
418 case '1': case '2': case '3': case '4': case '5':
419 case '6': case '7': case '8': case '9':
425 while ((digitab[ptr[1]] & ctype_digit) != 0)
426 c = c * 10 + *(++ptr) - '0';
427 if (c < 10 || c <= bracount)
432 ptr = oldptr; /* Put the pointer back and fall through */
435 /* Handle an octal number following \. If the first digit is 8 or 9, Perl
436 generates a binary zero byte and treats the digit as a following literal.
437 Thus we have to pull back the pointer by one. */
439 if ((c = *ptr) >= '8')
446 /* \0 always starts an octal number, but we may drop through to here with a
447 larger first octal digit. */
451 while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
452 c = c * 8 + *(++ptr) - '0';
453 c &= 255; /* Take least significant 8 bits */
456 /* \x is complicated when UTF-8 is enabled. \x{ddd} is a character number
457 which can be greater than 0xff, but only if the ddd are hex digits. */
461 if (ptr[1] == '{' && (options & PCRE_UTF8) != 0)
463 const uschar *pt = ptr + 2;
464 register int count = 0;
466 while ((digitab[*pt] & ctype_xdigit) != 0)
470 #if !EBCDIC /* ASCII coding */
471 if (cc >= 'a') cc -= 32; /* Convert to upper case */
472 c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
473 #else /* EBCDIC coding */
474 if (cc >= 'a' && cc <= 'z') cc += 64; /* Convert to upper case */
475 c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
480 if (c < 0 || count > 8) *errorcodeptr = ERR34;
484 /* If the sequence of hex digits does not end with '}', then we don't
485 recognize this construct; fall through to the normal \x handling. */
489 /* Read just a single hex char */
492 while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
494 int cc; /* Some compilers don't like ++ */
495 cc = *(++ptr); /* in initializers */
496 #if !EBCDIC /* ASCII coding */
497 if (cc >= 'a') cc -= 32; /* Convert to upper case */
498 c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
499 #else /* EBCDIC coding */
500 if (cc <= 'z') cc += 64; /* Convert to upper case */
501 c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
506 /* Other special escapes not starting with a digit are straightforward */
512 *errorcodeptr = ERR2;
516 /* A letter is upper-cased; then the 0x40 bit is flipped. This coding
517 is ASCII-specific, but then the whole concept of \cx is ASCII-specific.
518 (However, an EBCDIC equivalent has now been added.) */
520 #if !EBCDIC /* ASCII coding */
521 if (c >= 'a' && c <= 'z') c -= 32;
523 #else /* EBCDIC coding */
524 if (c >= 'a' && c <= 'z') c += 64;
529 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
530 other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
531 for Perl compatibility, it is a literal. This code looks a bit odd, but
532 there used to be some cases other than the default, and there may be again
533 in future, so I haven't "optimized" it. */
536 if ((options & PCRE_EXTRA) != 0) switch(c)
539 *errorcodeptr = ERR3;
553 /*************************************************
555 *************************************************/
557 /* This function is called after \P or \p has been encountered, provided that
558 PCRE is compiled with support for Unicode properties. On entry, ptrptr is
559 pointing at the P or p. On exit, it is pointing at the final character of the
563 ptrptr points to the pattern position pointer
564 negptr points to a boolean that is set TRUE for negation else FALSE
565 errorcodeptr points to the error code variable
567 Returns: value from ucp_type_table, or -1 for an invalid type
571 get_ucp(const uschar **ptrptr, BOOL *negptr, int *errorcodeptr)
574 const uschar *ptr = *ptrptr;
578 if (c == 0) goto ERROR_RETURN;
582 /* \P or \p can be followed by a one- or two-character name in {}, optionally
583 preceded by ^ for negation. */
592 for (i = 0; i <= 2; i++)
595 if (c == 0) goto ERROR_RETURN;
599 if (c !='}') /* Try to distinguish error cases */
601 while (*(++ptr) != 0 && *ptr != '}');
602 if (*ptr == '}') goto UNKNOWN_RETURN; else goto ERROR_RETURN;
607 /* Otherwise there is just one following character */
617 /* Search for a recognized property name using binary chop */
620 top = _pcre_utt_size;
625 c = strcmp(name, _pcre_utt[i].name);
626 if (c == 0) return _pcre_utt[i].value;
627 if (c > 0) bot = i + 1; else top = i;
631 *errorcodeptr = ERR47;
636 *errorcodeptr = ERR46;
645 /*************************************************
646 * Check for counted repeat *
647 *************************************************/
649 /* This function is called when a '{' is encountered in a place where it might
650 start a quantifier. It looks ahead to see if it really is a quantifier or not.
651 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
652 where the ddds are digits.
655 p pointer to the first char after '{'
657 Returns: TRUE or FALSE
661 is_counted_repeat(const uschar *p)
663 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
664 while ((digitab[*p] & ctype_digit) != 0) p++;
665 if (*p == '}') return TRUE;
667 if (*p++ != ',') return FALSE;
668 if (*p == '}') return TRUE;
670 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
671 while ((digitab[*p] & ctype_digit) != 0) p++;
678 /*************************************************
679 * Read repeat counts *
680 *************************************************/
682 /* Read an item of the form {n,m} and return the values. This is called only
683 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
684 so the syntax is guaranteed to be correct, but we need to check the values.
687 p pointer to first char after '{'
688 minp pointer to int for min
689 maxp pointer to int for max
690 returned as -1 if no max
691 errorcodeptr points to error code variable
693 Returns: pointer to '}' on success;
694 current ptr on error, with errorcodeptr set non-zero
697 static const uschar *
698 read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
703 /* Read the minimum value and do a paranoid check: a negative value indicates
704 an integer overflow. */
706 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
707 if (min < 0 || min > 65535)
709 *errorcodeptr = ERR5;
713 /* Read the maximum value if there is one, and again do a paranoid check on its size.
714 Also, max must not be less than min. */
716 if (*p == '}') max = min; else
721 while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
722 if (max < 0 || max > 65535)
724 *errorcodeptr = ERR5;
729 *errorcodeptr = ERR4;
735 /* Fill in the required variables, and pass back the pointer to the terminating
745 /*************************************************
746 * Find first significant op code *
747 *************************************************/
749 /* This is called by several functions that scan a compiled expression looking
750 for a fixed first character, or an anchoring op code etc. It skips over things
751 that do not influence this. For some calls, a change of option is important.
752 For some calls, it makes sense to skip negative forward and all backward
753 assertions, and also the \b assertion; for others it does not.
756 code pointer to the start of the group
757 options pointer to external options
758 optbit the option bit whose changing is significant, or
760 skipassert TRUE if certain assertions are to be skipped
762 Returns: pointer to the first significant opcode
766 first_significant_code(const uschar *code, int *options, int optbit,
774 if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
775 *options = (int)code[1];
781 case OP_ASSERTBACK_NOT:
782 if (!skipassert) return code;
783 do code += GET(code, 1); while (*code == OP_ALT);
784 code += _pcre_OP_lengths[*code];
787 case OP_WORD_BOUNDARY:
788 case OP_NOT_WORD_BOUNDARY:
789 if (!skipassert) return code;
795 code += _pcre_OP_lengths[*code];
802 /* Control never reaches here */
808 /*************************************************
809 * Find the fixed length of a pattern *
810 *************************************************/
812 /* Scan a pattern and compute the fixed length of subject that will match it,
813 if the length is fixed. This is needed for dealing with backward assertions.
814 In UTF8 mode, the result is in characters rather than bytes.
817 code points to the start of the pattern (the bracket)
818 options the compiling options
820 Returns: the fixed length, or -1 if there is no fixed length,
821 or -2 if \C was encountered
825 find_fixedlength(uschar *code, int options)
829 register int branchlength = 0;
830 register uschar *cc = code + 1 + LINK_SIZE;
832 /* Scan along the opcodes for this branch. If we get to the end of the
833 branch, check the length against that of the other branches. */
838 register int op = *cc;
839 if (op >= OP_BRA) op = OP_BRA;
846 d = find_fixedlength(cc, options);
849 do cc += GET(cc, 1); while (*cc == OP_ALT);
853 /* Reached end of a branch; if it's a ket it is the end of a nested
854 call. If it's ALT it is an alternation in a nested call. If it is
855 END it's the end of the outer call. All can be handled by the same code. */
862 if (length < 0) length = branchlength;
863 else if (length != branchlength) return -1;
864 if (*cc != OP_ALT) return length;
869 /* Skip over assertive subpatterns */
874 case OP_ASSERTBACK_NOT:
875 do cc += GET(cc, 1); while (*cc == OP_ALT);
878 /* Skip over things that don't match chars */
891 case OP_NOT_WORD_BOUNDARY:
892 case OP_WORD_BOUNDARY:
893 cc += _pcre_OP_lengths[*cc];
896 /* Handle literal characters */
903 if ((options & PCRE_UTF8) != 0)
905 while ((*cc & 0xc0) == 0x80) cc++;
910 /* Handle exact repetitions. The count is already in characters, but we
911 need to skip over a multibyte character in UTF8 mode. */
914 branchlength += GET2(cc,1);
917 if ((options & PCRE_UTF8) != 0)
919 while((*cc & 0x80) == 0x80) cc++;
925 branchlength += GET2(cc,1);
929 /* Handle single-char matchers */
938 case OP_NOT_WHITESPACE:
940 case OP_NOT_WORDCHAR:
947 /* The single-byte matcher isn't allowed */
952 /* Check a class for variable quantification */
956 cc += GET(cc, 1) - 33;
974 if (GET2(cc,1) != GET2(cc,3)) return -1;
975 branchlength += GET2(cc,1);
984 /* Anything else is variable length */
990 /* Control never gets here */
996 /*************************************************
997 * Scan compiled regex for numbered bracket *
998 *************************************************/
1000 /* This little function scans through a compiled pattern until it finds a
1001 capturing bracket with the given number.
1004 code points to start of expression
1005 utf8 TRUE in UTF-8 mode
1006 number the required bracket number
1008 Returns: pointer to the opcode for the bracket, or NULL if not found
1011 static const uschar *
1012 find_bracket(const uschar *code, BOOL utf8, int number)
1014 #ifndef SUPPORT_UTF8
1015 utf8 = utf8; /* Stop pedantic compilers complaining */
1020 register int c = *code;
1021 if (c == OP_END) return NULL;
1022 else if (c > OP_BRA)
1025 if (n > EXTRACT_BASIC_MAX) n = GET2(code, 2+LINK_SIZE);
1026 if (n == number) return (uschar *)code;
1027 code += _pcre_OP_lengths[OP_BRA];
1031 code += _pcre_OP_lengths[c];
1035 /* In UTF-8 mode, opcodes that are followed by a character may be followed
1036 by a multi-byte character. The length in the table is a minimum, so we have
1037 to scan along to skip the extra bytes. All opcodes are less than 128, so we
1038 can use relatively efficient code. */
1053 while ((*code & 0xc0) == 0x80) code++;
1056 /* XCLASS is used for classes that cannot be represented just by a bit
1057 map. This includes negated single high-valued characters. The length in
1058 the table is zero; the actual length is stored in the compiled code. */
1061 code += GET(code, 1) + 1;
1071 /*************************************************
1072 * Scan compiled regex for recursion reference *
1073 *************************************************/
1075 /* This little function scans through a compiled pattern until it finds an
1076 instance of OP_RECURSE.
1079 code points to start of expression
1080 utf8 TRUE in UTF-8 mode
1082 Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1085 static const uschar *
1086 find_recurse(const uschar *code, BOOL utf8)
1088 #ifndef SUPPORT_UTF8
1089 utf8 = utf8; /* Stop pedantic compilers complaining */
1094 register int c = *code;
1095 if (c == OP_END) return NULL;
1096 else if (c == OP_RECURSE) return code;
1097 else if (c > OP_BRA)
1099 code += _pcre_OP_lengths[OP_BRA];
1103 code += _pcre_OP_lengths[c];
1107 /* In UTF-8 mode, opcodes that are followed by a character may be followed
1108 by a multi-byte character. The length in the table is a minimum, so we have
1109 to scan along to skip the extra bytes. All opcodes are less than 128, so we
1110 can use relatively efficient code. */
1125 while ((*code & 0xc0) == 0x80) code++;
1128 /* XCLASS is used for classes that cannot be represented just by a bit
1129 map. This includes negated single high-valued characters. The length in
1130 the table is zero; the actual length is stored in the compiled code. */
1133 code += GET(code, 1) + 1;
1143 /*************************************************
1144 * Scan compiled branch for non-emptiness *
1145 *************************************************/
1147 /* This function scans through a branch of a compiled pattern to see whether it
1148 can match the empty string or not. It is called only from could_be_empty()
1149 below. Note that first_significant_code() skips over assertions. If we hit an
1150 unclosed bracket, we return "empty" - this means we've struck an inner bracket
1151 whose current branch will already have been scanned.
1154 code points to start of search
1155 endcode points to where to stop
1156 utf8 TRUE if in UTF8 mode
1158 Returns: TRUE if what is matched could be empty
1162 could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1165 for (code = first_significant_code(code + 1 + LINK_SIZE, NULL, 0, TRUE);
1167 code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1169 const uschar *ccode;
1176 if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1178 /* Scan a closed bracket */
1180 empty_branch = FALSE;
1183 if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1184 empty_branch = TRUE;
1185 code += GET(code, 1);
1187 while (*code == OP_ALT);
1188 if (!empty_branch) return FALSE; /* All branches are non-empty */
1189 code += 1 + LINK_SIZE;
1195 /* Check for quantifiers after a class */
1199 ccode = code + GET(code, 1);
1200 goto CHECK_CLASS_REPEAT;
1213 case OP_CRSTAR: /* These could be empty; continue */
1219 default: /* Non-repeat => class must match */
1220 case OP_CRPLUS: /* These repeats aren't empty */
1226 if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1231 /* Opcodes that must match a character */
1238 case OP_NOT_WHITESPACE:
1240 case OP_NOT_WORDCHAR:
1254 case OP_TYPEMINPLUS:
1266 /* In UTF-8 mode, STAR, MINSTAR, QUERY, MINQUERY, UPTO, and MINUPTO may be
1267 followed by a multibyte character */
1276 if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1287 /*************************************************
1288 * Scan compiled regex for non-emptiness *
1289 *************************************************/
1291 /* This function is called to check for left recursive calls. We want to check
1292 the current branch of the current pattern to see if it could match the empty
1293 string. If it could, we must look outwards for branches at other levels,
1294 stopping when we pass beyond the bracket which is the subject of the recursion.
1297 code points to start of the recursion
1298 endcode points to where to stop (current RECURSE item)
1299 bcptr points to the chain of current (unclosed) branch starts
1300 utf8 TRUE if in UTF-8 mode
1302 Returns: TRUE if what is matched could be empty
1306 could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1309 while (bcptr != NULL && bcptr->current >= code)
1311 if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1312 bcptr = bcptr->outer;
1319 /*************************************************
1320 * Check for POSIX class syntax *
1321 *************************************************/
1323 /* This function is called when the sequence "[:" or "[." or "[=" is
1324 encountered in a character class. It checks whether this is followed by an
1325 optional ^ and then a sequence of letters, terminated by a matching ":]" or
1329 ptr pointer to the initial [
1330 endptr where to return the end pointer
1331 cd pointer to compile data
1333 Returns: TRUE or FALSE
1337 check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
1339 int terminator; /* Don't combine these lines; the Solaris cc */
1340 terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
1341 if (*(++ptr) == '^') ptr++;
1342 while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
1343 if (*ptr == terminator && ptr[1] == ']')
1354 /*************************************************
1355 * Check POSIX class name *
1356 *************************************************/
1358 /* This function is called to check the name given in a POSIX-style class entry
1362 ptr points to the first letter
1363 len the length of the name
1365 Returns: a value representing the name, or -1 if unknown
1369 check_posix_name(const uschar *ptr, int len)
1371 register int yield = 0;
1372 while (posix_name_lengths[yield] != 0)
1374 if (len == posix_name_lengths[yield] &&
1375 strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
1382 /*************************************************
1383 * Adjust OP_RECURSE items in repeated group *
1384 *************************************************/
1386 /* OP_RECURSE items contain an offset from the start of the regex to the group
1387 that is referenced. This means that groups can be replicated for fixed
1388 repetition simply by copying (because the recursion is allowed to refer to
1389 earlier groups that are outside the current group). However, when a group is
1390 optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
1391 it, after it has been compiled. This means that any OP_RECURSE items within it
1392 that refer to the group itself or any contained groups have to have their
1393 offsets adjusted. That is the job of this function. Before it is called, the
1394 partially compiled regex must be temporarily terminated with OP_END.
1397 group points to the start of the group
1398 adjust the amount by which the group is to be moved
1399 utf8 TRUE in UTF-8 mode
1400 cd contains pointers to tables etc.
1406 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd)
1408 uschar *ptr = group;
1409 while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1411 int offset = GET(ptr, 1);
1412 if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1413 ptr += 1 + LINK_SIZE;
1419 /*************************************************
1420 * Insert an automatic callout point *
1421 *************************************************/
1423 /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
1424 callout points before each pattern item.
1427 code current code pointer
1428 ptr current pattern pointer
1429 cd pointers to tables etc
1431 Returns: new code pointer
1435 auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
1437 *code++ = OP_CALLOUT;
1439 PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
1440 PUT(code, LINK_SIZE, 0); /* Default length */
1441 return code + 2*LINK_SIZE;
1446 /*************************************************
1447 * Complete a callout item *
1448 *************************************************/
1450 /* A callout item contains the length of the next item in the pattern, which
1451 we can't fill in till after we have reached the relevant point. This is used
1452 for both automatic and manual callouts.
1455 previous_callout points to previous callout item
1456 ptr current pattern pointer
1457 cd pointers to tables etc
1463 complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
1465 int length = ptr - cd->start_pattern - GET(previous_callout, 2);
1466 PUT(previous_callout, 2 + LINK_SIZE, length);
1472 /*************************************************
1473 * Get othercase range *
1474 *************************************************/
1476 /* This function is passed the start and end of a class range, in UTF-8 mode
1477 with UCP support. It searches up the characters, looking for internal ranges of
1478 characters in the "other" case. Each call returns the next one, updating the
1482 cptr points to starting character value; updated
1484 ocptr where to put start of othercase range
1485 odptr where to put end of othercase range
1487 Yield: TRUE when range returned; FALSE when no more
1491 get_othercase_range(int *cptr, int d, int *ocptr, int *odptr)
1493 int c, chartype, othercase, next;
1495 for (c = *cptr; c <= d; c++)
1497 if (_pcre_ucp_findchar(c, &chartype, &othercase) == ucp_L && othercase != 0)
1501 if (c > d) return FALSE;
1504 next = othercase + 1;
1506 for (++c; c <= d; c++)
1508 if (_pcre_ucp_findchar(c, &chartype, &othercase) != ucp_L ||
1519 #endif /* SUPPORT_UCP */
1522 /*************************************************
1523 * Compile one branch *
1524 *************************************************/
1526 /* Scan the pattern, compiling it into the code vector. If the options are
1527 changed during the branch, the pointer is used to change the external options
1531 optionsptr pointer to the option bits
1532 brackets points to number of extracting brackets used
1533 codeptr points to the pointer to the current code point
1534 ptrptr points to the current pattern pointer
1535 errorcodeptr points to error code variable
1536 firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
1537 reqbyteptr set to the last literal character required, else < 0
1538 bcptr points to current branch chain
1539 cd contains pointers to tables etc.
1541 Returns: TRUE on success
1542 FALSE, with *errorcodeptr set non-zero on error
1546 compile_branch(int *optionsptr, int *brackets, uschar **codeptr,
1547 const uschar **ptrptr, int *errorcodeptr, int *firstbyteptr,
1548 int *reqbyteptr, branch_chain *bcptr, compile_data *cd)
1550 int repeat_type, op_type;
1551 int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
1553 int greedy_default, greedy_non_default;
1554 int firstbyte, reqbyte;
1555 int zeroreqbyte, zerofirstbyte;
1556 int req_caseopt, reqvary, tempreqvary;
1558 int options = *optionsptr;
1559 int after_manual_callout = 0;
1561 register uschar *code = *codeptr;
1563 BOOL inescq = FALSE;
1564 BOOL groupsetfirstbyte = FALSE;
1565 const uschar *ptr = *ptrptr;
1566 const uschar *tempptr;
1567 uschar *previous = NULL;
1568 uschar *previous_callout = NULL;
1569 uschar classbits[32];
1573 BOOL utf8 = (options & PCRE_UTF8) != 0;
1574 uschar *class_utf8data;
1575 uschar utf8_char[6];
1580 /* Set up the default and non-default settings for greediness */
1582 greedy_default = ((options & PCRE_UNGREEDY) != 0);
1583 greedy_non_default = greedy_default ^ 1;
1585 /* Initialize no first byte, no required byte. REQ_UNSET means "no char
1586 matching encountered yet". It gets changed to REQ_NONE if we hit something that
1587 matches a non-fixed first char; reqbyte just remains unset if we never
1590 When we hit a repeat whose minimum is zero, we may have to adjust these values
1591 to take the zero repeat into account. This is implemented by setting them to
1592 zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
1593 item types that can be repeated set these backoff variables appropriately. */
1595 firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
1597 /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
1598 according to the current setting of the caseless flag. REQ_CASELESS is a bit
1599 value > 255. It is added into the firstbyte or reqbyte variables to record the
1600 case status of the value. This is used only for ASCII characters. */
1602 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
1604 /* Switch on next character until the end of the branch */
1609 BOOL possessive_quantifier;
1611 int class_charcount;
1621 /* Next byte in the pattern */
1625 /* If in \Q...\E, check for the end; if not, we have a literal */
1627 if (inescq && c != 0)
1629 if (c == '\\' && ptr[1] == 'E')
1637 if (previous_callout != NULL)
1639 complete_callout(previous_callout, ptr, cd);
1640 previous_callout = NULL;
1642 if ((options & PCRE_AUTO_CALLOUT) != 0)
1644 previous_callout = code;
1645 code = auto_callout(code, ptr, cd);
1651 /* Fill in length of a previous callout, except when the next thing is
1654 is_quantifier = c == '*' || c == '+' || c == '?' ||
1655 (c == '{' && is_counted_repeat(ptr+1));
1657 if (!is_quantifier && previous_callout != NULL &&
1658 after_manual_callout-- <= 0)
1660 complete_callout(previous_callout, ptr, cd);
1661 previous_callout = NULL;
1664 /* In extended mode, skip white space and comments */
1666 if ((options & PCRE_EXTENDED) != 0)
1668 if ((cd->ctypes[c] & ctype_space) != 0) continue;
1671 /* The space before the ; is to avoid a warning on a silly compiler
1672 on the Macintosh. */
1673 while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
1674 if (c != 0) continue; /* Else fall through to handle end of string */
1678 /* No auto callout for quantifiers. */
1680 if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
1682 previous_callout = code;
1683 code = auto_callout(code, ptr, cd);
1688 /* The branch terminates at end of string, |, or ). */
1693 *firstbyteptr = firstbyte;
1694 *reqbyteptr = reqbyte;
1699 /* Handle single-character metacharacters. In multiline mode, ^ disables
1700 the setting of any following char as a first character. */
1703 if ((options & PCRE_MULTILINE) != 0)
1705 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
1716 /* There can never be a first char if '.' is first, whatever happens about
1717 repeats. The value of reqbyte doesn't change either. */
1720 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
1721 zerofirstbyte = firstbyte;
1722 zeroreqbyte = reqbyte;
1727 /* Character classes. If the included characters are all < 256 in value, we
1728 build a 32-byte bitmap of the permitted characters, except in the special
1729 case where there is only one such character. For negated classes, we build
1730 the map as usual, then invert it at the end. However, we use a different
1731 opcode so that data characters > 255 can be handled correctly.
1733 If the class contains characters outside the 0-255 range, a different
1734 opcode is compiled. It may optionally have a bit map for characters < 256,
1735 but those above are explicitly listed afterwards. A flag byte tells
1736 whether the bitmap is present, and whether this is a negated class or not.
1742 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
1743 they are encountered at the top level, so we'll do that too. */
1745 if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
1746 check_posix_syntax(ptr, &tempptr, cd))
1748 *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;
1752 /* If the first character is '^', set the negation flag and skip it. */
1754 if ((c = *(++ptr)) == '^')
1756 negate_class = TRUE;
1761 negate_class = FALSE;
1764 /* Keep a count of chars with values < 256 so that we can optimize the case
1765 of just a single character (as long as it's < 256). For higher valued UTF-8
1766 characters, we don't yet do any optimization. */
1768 class_charcount = 0;
1769 class_lastchar = -1;
1772 class_utf8 = FALSE; /* No chars >= 256 */
1773 class_utf8data = code + LINK_SIZE + 34; /* For UTF-8 items */
1776 /* Initialize the 32-char bit map to all zeros. We have to build the
1777 map in a temporary bit of store, in case the class contains only 1
1778 character (< 256), because in that case the compiled code doesn't use the
1781 memset(classbits, 0, 32 * sizeof(uschar));
1783 /* Process characters until ] is reached. By writing this as a "do" it
1784 means that an initial ] is taken as a data character. The first pass
1785 through the regex checked the overall syntax, so we don't need to be very
1786 strict here. At the start of the loop, c contains the first byte of the
1792 if (utf8 && c > 127)
1793 { /* Braces are required because the */
1794 GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
1798 /* Inside \Q...\E everything is literal except \E */
1802 if (c == '\\' && ptr[1] == 'E')
1808 else goto LONE_SINGLE_CHARACTER;
1811 /* Handle POSIX class names. Perl allows a negation extension of the
1812 form [:^name:]. A square bracket that doesn't match the syntax is
1813 treated as a literal. We also recognize the POSIX constructions
1814 [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
1818 (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
1819 check_posix_syntax(ptr, &tempptr, cd))
1821 BOOL local_negate = FALSE;
1823 register const uschar *cbits = cd->cbits;
1827 *errorcodeptr = ERR31;
1834 local_negate = TRUE;
1838 posix_class = check_posix_name(ptr, tempptr - ptr);
1839 if (posix_class < 0)
1841 *errorcodeptr = ERR30;
1845 /* If matching is caseless, upper and lower are converted to
1846 alpha. This relies on the fact that the class table starts with
1847 alpha, lower, upper as the first 3 entries. */
1849 if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
1852 /* Or into the map we are building up to 3 of the static class
1853 tables, or their negations. The [:blank:] class sets up the same
1854 chars as the [:space:] class (all white space). We remove the vertical
1855 white space chars afterwards. */
1858 for (i = 0; i < 3; i++)
1860 BOOL blankclass = strncmp((char *)ptr, "blank", 5) == 0;
1861 int taboffset = posix_class_maps[posix_class + i];
1862 if (taboffset < 0) break;
1866 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+taboffset];
1868 for (c = 0; c < 32; c++) classbits[c] &= ~cbits[c+taboffset];
1869 if (blankclass) classbits[1] |= 0x3c;
1873 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+taboffset];
1874 if (blankclass) classbits[1] &= ~0x3c;
1879 class_charcount = 10; /* Set > 1; assumes more than 1 per class */
1880 continue; /* End of POSIX syntax handling */
1883 /* Backslash may introduce a single character, or it may introduce one
1884 of the specials, which just set a flag. Escaped items are checked for
1885 validity in the pre-compiling pass. The sequence \b is a special case.
1886 Inside a class (and only there) it is treated as backspace. Elsewhere
1887 it marks a word boundary. Other escapes have preset maps ready to
1888 or into the one we are building. We assume they have more than one
1889 character in them, so set class_charcount bigger than one. */
1893 c = check_escape(&ptr, errorcodeptr, *brackets, options, TRUE);
1895 if (-c == ESC_b) c = '\b'; /* \b is backspace in a class */
1896 else if (-c == ESC_X) c = 'X'; /* \X is literal X in a class */
1897 else if (-c == ESC_Q) /* Handle start of quoted string */
1899 if (ptr[1] == '\\' && ptr[2] == 'E')
1901 ptr += 2; /* avoid empty string */
1909 register const uschar *cbits = cd->cbits;
1910 class_charcount += 2; /* Greater than 1 is what matters */
1914 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
1918 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
1922 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
1926 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
1930 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
1931 classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
1935 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
1936 classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
1944 int property = get_ucp(&ptr, &negated, errorcodeptr);
1945 if (property < 0) goto FAILED;
1947 *class_utf8data++ = ((-c == ESC_p) != negated)?
1948 XCL_PROP : XCL_NOTPROP;
1949 *class_utf8data++ = property;
1950 class_charcount -= 2; /* Not a < 256 character */
1955 /* Unrecognized escapes are faulted if PCRE is running in its
1956 strict mode. By default, for compatibility with Perl, they are
1957 treated as literals. */
1960 if ((options & PCRE_EXTRA) != 0)
1962 *errorcodeptr = ERR7;
1965 c = *ptr; /* The final character */
1966 class_charcount -= 2; /* Undo the default count from above */
1970 /* Fall through if we have a single character (c >= 0). This may be
1971 > 256 in UTF-8 mode. */
1973 } /* End of backslash handling */
1975 /* A single character may be followed by '-' to form a range. However,
1976 Perl does not permit ']' to be the end of the range. A '-' character
1977 here is treated as a literal. */
1979 if (ptr[1] == '-' && ptr[2] != ']')
1986 { /* Braces are required because the */
1987 GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
1991 d = *ptr; /* Not UTF-8 mode */
1993 /* The second part of a range can be a single-character escape, but
1994 not any of the other escapes. Perl 5.6 treats a hyphen as a literal
1995 in such circumstances. */
1999 const uschar *oldptr = ptr;
2000 d = check_escape(&ptr, errorcodeptr, *brackets, options, TRUE);
2002 /* \b is backspace; \X is literal X; any other special means the '-'
2007 if (d == -ESC_b) d = '\b';
2008 else if (d == -ESC_X) d = 'X'; else
2011 goto LONE_SINGLE_CHARACTER; /* A few lines below */
2016 /* The check that the two values are in the correct order happens in
2017 the pre-pass. Optimize one-character ranges */
2019 if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
2021 /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
2022 matching, we have to use an XCLASS with extra data items. Caseless
2023 matching for characters > 127 is available only if UCP support is
2027 if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
2031 /* With UCP support, we can find the other case equivalents of
2032 the relevant characters. There may be several ranges. Optimize how
2033 they fit with the basic range. */
2036 if ((options & PCRE_CASELESS) != 0)
2041 while (get_othercase_range(&cc, origd, &occ, &ocd))
2043 if (occ >= c && ocd <= d) continue; /* Skip embedded ranges */
2045 if (occ < c && ocd >= c - 1) /* Extend the basic range */
2046 { /* if there is overlap, */
2047 c = occ; /* noting that if occ < c */
2048 continue; /* we can't have ocd > d */
2049 } /* because a subrange is */
2050 if (ocd > d && occ <= d + 1) /* always shorter than */
2051 { /* the basic range. */
2058 *class_utf8data++ = XCL_SINGLE;
2062 *class_utf8data++ = XCL_RANGE;
2063 class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
2065 class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
2068 #endif /* SUPPORT_UCP */
2070 /* Now record the original range, possibly modified for UCP caseless
2071 overlapping ranges. */
2073 *class_utf8data++ = XCL_RANGE;
2074 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
2075 class_utf8data += _pcre_ord2utf8(d, class_utf8data);
2077 /* With UCP support, we are done. Without UCP support, there is no
2078 caseless matching for UTF-8 characters > 127; we can use the bit map
2079 for the smaller ones. */
2082 continue; /* With next character in the class */
2084 if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
2086 /* Adjust upper limit and fall through to set up the map */
2090 #endif /* SUPPORT_UCP */
2092 #endif /* SUPPORT_UTF8 */
2094 /* We use the bit map for all cases when not in UTF-8 mode; else
2095 ranges that lie entirely within 0-127 when there is UCP support; else
2096 for partial ranges without UCP support. */
2100 classbits[c/8] |= (1 << (c&7));
2101 if ((options & PCRE_CASELESS) != 0)
2103 int uc = cd->fcc[c]; /* flip case */
2104 classbits[uc/8] |= (1 << (uc&7));
2106 class_charcount++; /* in case a one-char range */
2110 continue; /* Go get the next char in the class */
2113 /* Handle a lone single character - we can get here for a normal
2114 non-escape char, or after \ that introduces a single character or for an
2115 apparent range that isn't. */
2117 LONE_SINGLE_CHARACTER:
2119 /* Handle a character that cannot go in the bit map */
2122 if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
2125 *class_utf8data++ = XCL_SINGLE;
2126 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
2129 if ((options & PCRE_CASELESS) != 0)
2133 if (_pcre_ucp_findchar(c, &chartype, &othercase) >= 0 &&
2136 *class_utf8data++ = XCL_SINGLE;
2137 class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
2140 #endif /* SUPPORT_UCP */
2144 #endif /* SUPPORT_UTF8 */
2146 /* Handle a single-byte character */
2148 classbits[c/8] |= (1 << (c&7));
2149 if ((options & PCRE_CASELESS) != 0)
2151 c = cd->fcc[c]; /* flip case */
2152 classbits[c/8] |= (1 << (c&7));
2159 /* Loop until ']' reached; the check for end of string happens inside the
2160 loop. This "while" is the end of the "do" above. */
2162 while ((c = *(++ptr)) != ']' || inescq);
2164 /* If class_charcount is 1, we saw precisely one character whose value is
2165 less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we
2166 can optimize the negative case only if there were no characters >= 128
2167 because OP_NOT and the related opcodes like OP_NOTSTAR operate on
2168 single-bytes only. This is an historical hangover. Maybe one day we can
2169 tidy these opcodes to handle multi-byte characters.
2171 The optimization throws away the bit map. We turn the item into a
2172 1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
2173 that OP_NOT does not support multibyte characters. In the positive case, it
2174 can cause firstbyte to be set. Otherwise, there can be no first char if
2175 this item is first, whatever repeat count may follow. In the case of
2176 reqbyte, save the previous value for reinstating. */
2179 if (class_charcount == 1 &&
2181 (!class_utf8 && (!negate_class || class_lastchar < 128))))
2184 if (class_charcount == 1)
2187 zeroreqbyte = reqbyte;
2189 /* The OP_NOT opcode works on one-byte characters only. */
2193 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2194 zerofirstbyte = firstbyte;
2196 *code++ = class_lastchar;
2200 /* For a single, positive character, get the value into mcbuffer, and
2201 then we can handle this with the normal one-character code. */
2204 if (utf8 && class_lastchar > 127)
2205 mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
2209 mcbuffer[0] = class_lastchar;
2213 } /* End of 1-char optimization */
2215 /* The general case - not the one-char optimization. If this is the first
2216 thing in the branch, there can be no first char setting, whatever the
2217 repeat count. Any reqbyte setting must remain unchanged after any kind of
2220 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2221 zerofirstbyte = firstbyte;
2222 zeroreqbyte = reqbyte;
2224 /* If there are characters with values > 255, we have to compile an
2225 extended class, with its own opcode. If there are no characters < 256,
2226 we can omit the bitmap. */
2231 *class_utf8data++ = XCL_END; /* Marks the end of extra data */
2232 *code++ = OP_XCLASS;
2234 *code = negate_class? XCL_NOT : 0;
2236 /* If the map is required, install it, and move on to the end of
2239 if (class_charcount > 0)
2242 memcpy(code, classbits, 32);
2243 code = class_utf8data;
2246 /* If the map is not required, slide down the extra data. */
2250 int len = class_utf8data - (code + 33);
2251 memmove(code + 1, code + 33, len);
2255 /* Now fill in the complete length of the item */
2257 PUT(previous, 1, code - previous);
2258 break; /* End of class handling */
2262 /* If there are no characters > 255, negate the 32-byte map if necessary,
2263 and copy it into the code vector. If this is the first thing in the branch,
2264 there can be no first char setting, whatever the repeat count. Any reqbyte
2265 setting must remain unchanged after any kind of repeat. */
2269 *code++ = OP_NCLASS;
2270 for (c = 0; c < 32; c++) code[c] = ~classbits[c];
2275 memcpy(code, classbits, 32);
2280 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
2281 has been tested above. */
2284 if (!is_quantifier) goto NORMAL_CHAR;
2285 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
2286 if (*errorcodeptr != 0) goto FAILED;
2304 if (previous == NULL)
2306 *errorcodeptr = ERR9;
2310 if (repeat_min == 0)
2312 firstbyte = zerofirstbyte; /* Adjust for zero repeat */
2313 reqbyte = zeroreqbyte; /* Ditto */
2316 /* Remember whether this is a variable length repeat */
2318 reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
2320 op_type = 0; /* Default single-char op codes */
2321 possessive_quantifier = FALSE; /* Default not possessive quantifier */
2323 /* Save start of previous item, in case we have to move it up to make space
2324 for an inserted OP_ONCE for the additional '+' extension. */
2326 tempcode = previous;
2328 /* If the next character is '+', we have a possessive quantifier. This
2329 implies greediness, whatever the setting of the PCRE_UNGREEDY option.
2330 If the next character is '?' this is a minimizing repeat, by default,
2331 but if PCRE_UNGREEDY is set, it works the other way round. We change the
2332 repeat type to the non-default. */
2336 repeat_type = 0; /* Force greedy */
2337 possessive_quantifier = TRUE;
2340 else if (ptr[1] == '?')
2342 repeat_type = greedy_non_default;
2345 else repeat_type = greedy_default;
2347 /* If previous was a recursion, we need to wrap it inside brackets so that
2348 it can be replicated if necessary. */
2350 if (*previous == OP_RECURSE)
2352 memmove(previous + 1 + LINK_SIZE, previous, 1 + LINK_SIZE);
2353 code += 1 + LINK_SIZE;
2355 PUT(previous, 1, code - previous);
2357 PUT(code, 1, code - previous);
2358 code += 1 + LINK_SIZE;
2361 /* If previous was a character match, abolish the item and generate a
2362 repeat item instead. If a char item has a minimum of more than one, ensure
2363 that it is set in reqbyte - it might not be if a sequence such as x{3} is
2364 the first thing in a branch because the x will have gone into firstbyte
2367 if (*previous == OP_CHAR || *previous == OP_CHARNC)
2369 /* Deal with UTF-8 characters that take up more than one byte. It's
2370 easier to write this out separately than try to macrify it. Use c to
2371 hold the length of the character in bytes, plus 0x80 to flag that it's a
2372 length rather than a small character. */
2375 if (utf8 && (code[-1] & 0x80) != 0)
2377 uschar *lastchar = code - 1;
2378 while((*lastchar & 0xc0) == 0x80) lastchar--;
2379 c = code - lastchar; /* Length of UTF-8 character */
2380 memcpy(utf8_char, lastchar, c); /* Save the char */
2381 c |= 0x80; /* Flag c as a length */
2386 /* Handle the case of a single byte - either with no UTF8 support, or
2387 with UTF-8 disabled, or for a UTF-8 character < 128. */
2391 if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
2394 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
2397 /* If previous was a single negated character ([^a] or similar), we use
2398 one of the special opcodes, replacing it. The code is shared with single-
2399 character repeats by setting op_type to add a suitable offset into
2400 repeat_type. OP_NOT is currently used only for single-byte chars. */
2402 else if (*previous == OP_NOT)
2404 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
2406 goto OUTPUT_SINGLE_REPEAT;
2409 /* If previous was a character type match (\d or similar), abolish it and
2410 create a suitable repeat item. The code is shared with single-character
2411 repeats by setting op_type to add a suitable offset into repeat_type. Note
2412 that the Unicode property types will be present only when SUPPORT_UCP is
2413 defined, but we don't wrap the little bits of code here because it just
2414 makes it horribly messy. */
2416 else if (*previous < OP_EODN)
2420 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
2423 OUTPUT_SINGLE_REPEAT:
2424 prop_type = (*previous == OP_PROP || *previous == OP_NOTPROP)?
2428 code = previous; /* Usually overwrite previous item */
2430 /* If the maximum is zero then the minimum must also be zero; Perl allows
2431 this case, so we do too - by simply omitting the item altogether. */
2433 if (repeat_max == 0) goto END_REPEAT;
2435 /* All real repeats make it impossible to handle partial matching (maybe
2436 one day we will be able to remove this restriction). */
2438 if (repeat_max != 1) cd->nopartial = TRUE;
2440 /* Combine the op_type with the repeat_type */
2442 repeat_type += op_type;
2444 /* A minimum of zero is handled either as the special case * or ?, or as
2445 an UPTO, with the maximum given. */
2447 if (repeat_min == 0)
2449 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
2450 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
2453 *code++ = OP_UPTO + repeat_type;
2454 PUT2INC(code, 0, repeat_max);
2458 /* A repeat minimum of 1 is optimized into some special cases. If the
2459 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
2460 left in place and, if the maximum is greater than 1, we use OP_UPTO with
2461 one less than the maximum. */
2463 else if (repeat_min == 1)
2465 if (repeat_max == -1)
2466 *code++ = OP_PLUS + repeat_type;
2469 code = oldcode; /* leave previous item in place */
2470 if (repeat_max == 1) goto END_REPEAT;
2471 *code++ = OP_UPTO + repeat_type;
2472 PUT2INC(code, 0, repeat_max - 1);
2476 /* The case {n,n} is just an EXACT, while the general case {n,m} is
2477 handled as an EXACT followed by an UPTO. */
2481 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
2482 PUT2INC(code, 0, repeat_min);
2484 /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
2485 we have to insert the character for the previous code. For a repeated
2486 Unicode property match, there is an extra byte that defines the
2487 required property. In UTF-8 mode, long characters have their length in
2488 c, with the 0x80 bit as a flag. */
2493 if (utf8 && c >= 128)
2495 memcpy(code, utf8_char, c & 7);
2502 if (prop_type >= 0) *code++ = prop_type;
2504 *code++ = OP_STAR + repeat_type;
2507 /* Else insert an UPTO if the max is greater than the min, again
2508 preceded by the character, for the previously inserted code. */
2510 else if (repeat_max != repeat_min)
2513 if (utf8 && c >= 128)
2515 memcpy(code, utf8_char, c & 7);
2521 if (prop_type >= 0) *code++ = prop_type;
2522 repeat_max -= repeat_min;
2523 *code++ = OP_UPTO + repeat_type;
2524 PUT2INC(code, 0, repeat_max);
2528 /* The character or character type itself comes last in all cases. */
2531 if (utf8 && c >= 128)
2533 memcpy(code, utf8_char, c & 7);
2540 /* For a repeated Unicode property match, there is an extra byte that
2541 defines the required property. */
2544 if (prop_type >= 0) *code++ = prop_type;
2548 /* If previous was a character class or a back reference, we put the repeat
2549 stuff after it, but just skip the item if the repeat was {0,0}. */
2551 else if (*previous == OP_CLASS ||
2552 *previous == OP_NCLASS ||
2554 *previous == OP_XCLASS ||
2556 *previous == OP_REF)
2558 if (repeat_max == 0)
2564 /* All real repeats make it impossible to handle partial matching (maybe
2565 one day we will be able to remove this restriction). */
2567 if (repeat_max != 1) cd->nopartial = TRUE;
2569 if (repeat_min == 0 && repeat_max == -1)
2570 *code++ = OP_CRSTAR + repeat_type;
2571 else if (repeat_min == 1 && repeat_max == -1)
2572 *code++ = OP_CRPLUS + repeat_type;
2573 else if (repeat_min == 0 && repeat_max == 1)
2574 *code++ = OP_CRQUERY + repeat_type;
2577 *code++ = OP_CRRANGE + repeat_type;
2578 PUT2INC(code, 0, repeat_min);
2579 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
2580 PUT2INC(code, 0, repeat_max);
2584 /* If previous was a bracket group, we may have to replicate it in certain
2587 else if (*previous >= OP_BRA || *previous == OP_ONCE ||
2588 *previous == OP_COND)
2592 int len = code - previous;
2593 uschar *bralink = NULL;
2595 /* If the maximum repeat count is unlimited, find the end of the bracket
2596 by scanning through from the start, and compute the offset back to it
2597 from the current code pointer. There may be an OP_OPT setting following
2598 the final KET, so we can't find the end just by going back from the code
2601 if (repeat_max == -1)
2603 register uschar *ket = previous;
2604 do ket += GET(ket, 1); while (*ket != OP_KET);
2605 ketoffset = code - ket;
2608 /* The case of a zero minimum is special because of the need to stick
2609 OP_BRAZERO in front of it, and because the group appears once in the
2610 data, whereas in other cases it appears the minimum number of times. For
2611 this reason, it is simplest to treat this case separately, as otherwise
2612 the code gets far too messy. There are several special subcases when the
2615 if (repeat_min == 0)
2617 /* If the maximum is also zero, we just omit the group from the output
2620 if (repeat_max == 0)
2626 /* If the maximum is 1 or unlimited, we just have to stick in the
2627 BRAZERO and do no more at this point. However, we do need to adjust
2628 any OP_RECURSE calls inside the group that refer to the group itself or
2629 any internal group, because the offset is from the start of the whole
2630 regex. Temporarily terminate the pattern while doing this. */
2632 if (repeat_max <= 1)
2635 adjust_recurse(previous, 1, utf8, cd);
2636 memmove(previous+1, previous, len);
2638 *previous++ = OP_BRAZERO + repeat_type;
2641 /* If the maximum is greater than 1 and limited, we have to replicate
2642 in a nested fashion, sticking OP_BRAZERO before each set of brackets.
2643 The first one has to be handled carefully because it's the original
2644 copy, which has to be moved up. The remainder can be handled by code
2645 that is common with the non-zero minimum case below. We have to
2646 adjust the value of repeat_max, since one less copy is required. Once
2647 again, we may have to adjust any OP_RECURSE calls inside the group. */
2653 adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd);
2654 memmove(previous + 2 + LINK_SIZE, previous, len);
2655 code += 2 + LINK_SIZE;
2656 *previous++ = OP_BRAZERO + repeat_type;
2657 *previous++ = OP_BRA;
2659 /* We chain together the bracket offset fields that have to be
2660 filled in later when the ends of the brackets are reached. */
2662 offset = (bralink == NULL)? 0 : previous - bralink;
2664 PUTINC(previous, 0, offset);
2670 /* If the minimum is greater than zero, replicate the group as many
2671 times as necessary, and adjust the maximum to the number of subsequent
2672 copies that we need. If we set a first char from the group, and didn't
2673 set a required char, copy the latter from the former. */
2679 if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
2680 for (i = 1; i < repeat_min; i++)
2682 memcpy(code, previous, len);
2686 if (repeat_max > 0) repeat_max -= repeat_min;
2689 /* This code is common to both the zero and non-zero minimum cases. If
2690 the maximum is limited, it replicates the group in a nested fashion,
2691 remembering the bracket starts on a stack. In the case of a zero minimum,
2692 the first one was set up above. In all cases the repeat_max now specifies
2693 the number of additional copies needed. */
2695 if (repeat_max >= 0)
2697 for (i = repeat_max - 1; i >= 0; i--)
2699 *code++ = OP_BRAZERO + repeat_type;
2701 /* All but the final copy start a new nesting, maintaining the
2702 chain of brackets outstanding. */
2708 offset = (bralink == NULL)? 0 : code - bralink;
2710 PUTINC(code, 0, offset);
2713 memcpy(code, previous, len);
2717 /* Now chain through the pending brackets, and fill in their length
2718 fields (which are holding the chain links pro tem). */
2720 while (bralink != NULL)
2723 int offset = code - bralink + 1;
2724 uschar *bra = code - offset;
2725 oldlinkoffset = GET(bra, 1);
2726 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
2728 PUTINC(code, 0, offset);
2729 PUT(bra, 1, offset);
2733 /* If the maximum is unlimited, set a repeater in the final copy. We
2734 can't just offset backwards from the current code point, because we
2735 don't know if there's been an options resetting after the ket. The
2736 correct offset was computed above. */
2738 else code[-ketoffset] = OP_KETRMAX + repeat_type;
2741 /* Else there's some kind of shambles */
2745 *errorcodeptr = ERR11;
2749 /* If the character following a repeat is '+', we wrap the entire repeated
2750 item inside OP_ONCE brackets. This is just syntactic sugar, taken from
2751 Sun's Java package. The repeated item starts at tempcode, not at previous,
2752 which might be the first part of a string whose (former) last char we
2753 repeated. However, we don't support '+' after a greediness '?'. */
2755 if (possessive_quantifier)
2757 int len = code - tempcode;
2758 memmove(tempcode + 1+LINK_SIZE, tempcode, len);
2759 code += 1 + LINK_SIZE;
2760 len += 1 + LINK_SIZE;
2761 tempcode[0] = OP_ONCE;
2763 PUTINC(code, 0, len);
2764 PUT(tempcode, 1, len);
2767 /* In all cases we no longer have a previous item. We also set the
2768 "follows varying string" flag for subsequently encountered reqbytes if
2769 it isn't already set and we have just passed a varying length item. */
2773 cd->req_varyopt |= reqvary;
2777 /* Start of nested bracket sub-expression, or comment or lookahead or
2778 lookbehind or option setting or condition. First deal with special things
2779 that can come after a bracket; all are introduced by ?, and the appearance
2780 of any of them means that this is not a referencing group. They were
2781 checked for validity in the first pass over the string, so we don't have to
2782 check for syntax errors here. */
2785 newoptions = options;
2788 if (*(++ptr) == '?')
2795 case '#': /* Comment; skip to ket */
2797 while (*ptr != ')') ptr++;
2800 case ':': /* Non-extracting bracket */
2806 bravalue = OP_COND; /* Conditional group */
2808 /* Condition to test for recursion */
2812 code[1+LINK_SIZE] = OP_CREF;
2813 PUT2(code, 2+LINK_SIZE, CREF_RECURSE);
2818 /* Condition to test for a numbered subpattern match. We know that
2819 if a digit follows ( then there will just be digits until ) because
2820 the syntax was checked in the first pass. */
2822 else if ((digitab[ptr[1]] && ctype_digit) != 0)
2824 int condref; /* Don't amalgamate; some compilers */
2825 condref = *(++ptr) - '0'; /* grumble at autoincrement in declaration */
2826 while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';
2829 *errorcodeptr = ERR35;
2833 code[1+LINK_SIZE] = OP_CREF;
2834 PUT2(code, 2+LINK_SIZE, condref);
2837 /* For conditions that are assertions, we just fall through, having
2838 set bravalue above. */
2841 case '=': /* Positive lookahead */
2842 bravalue = OP_ASSERT;
2846 case '!': /* Negative lookahead */
2847 bravalue = OP_ASSERT_NOT;
2851 case '<': /* Lookbehinds */
2854 case '=': /* Positive lookbehind */
2855 bravalue = OP_ASSERTBACK;
2859 case '!': /* Negative lookbehind */
2860 bravalue = OP_ASSERTBACK_NOT;
2866 case '>': /* One-time brackets */
2871 case 'C': /* Callout - may be followed by digits; */
2872 previous_callout = code; /* Save for later completion */
2873 after_manual_callout = 1; /* Skip one item before completing */
2874 *code++ = OP_CALLOUT; /* Already checked that the terminating */
2875 { /* closing parenthesis is present. */
2877 while ((digitab[*(++ptr)] & ctype_digit) != 0)
2878 n = n * 10 + *ptr - '0';
2881 *errorcodeptr = ERR38;
2885 PUT(code, 0, ptr - cd->start_pattern + 1); /* Pattern offset */
2886 PUT(code, LINK_SIZE, 0); /* Default length */
2887 code += 2 * LINK_SIZE;
2892 case 'P': /* Named subpattern handling */
2893 if (*(++ptr) == '<') /* Definition */
2896 uschar *slot = cd->name_table;
2897 const uschar *name; /* Don't amalgamate; some compilers */
2898 name = ++ptr; /* grumble at autoincrement in declaration */
2900 while (*ptr++ != '>');
2901 namelen = ptr - name - 1;
2903 for (i = 0; i < cd->names_found; i++)
2905 int crc = memcmp(name, slot+2, namelen);
2908 if (slot[2+namelen] == 0)
2910 *errorcodeptr = ERR43;
2913 crc = -1; /* Current name is substring */
2917 memmove(slot + cd->name_entry_size, slot,
2918 (cd->names_found - i) * cd->name_entry_size);
2921 slot += cd->name_entry_size;
2924 PUT2(slot, 0, *brackets + 1);
2925 memcpy(slot + 2, name, namelen);
2926 slot[2+namelen] = 0;
2928 goto NUMBERED_GROUP;
2931 if (*ptr == '=' || *ptr == '>') /* Reference or recursion */
2935 const uschar *name = ptr;
2936 uschar *slot = cd->name_table;
2938 while (*ptr != ')') ptr++;
2939 namelen = ptr - name;
2941 for (i = 0; i < cd->names_found; i++)
2943 if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
2944 slot += cd->name_entry_size;
2946 if (i >= cd->names_found)
2948 *errorcodeptr = ERR15;
2952 recno = GET2(slot, 0);
2954 if (type == '>') goto HANDLE_RECURSION; /* A few lines below */
2956 /* Back reference */
2960 PUT2INC(code, 0, recno);
2961 cd->backref_map |= (recno < 32)? (1 << recno) : 1;
2962 if (recno > cd->top_backref) cd->top_backref = recno;
2966 /* Should never happen */
2969 case 'R': /* Pattern recursion */
2970 ptr++; /* Same as (?0) */
2973 /* Recursion or "subroutine" call */
2975 case '0': case '1': case '2': case '3': case '4':
2976 case '5': case '6': case '7': case '8': case '9':
2978 const uschar *called;
2980 while((digitab[*ptr] & ctype_digit) != 0)
2981 recno = recno * 10 + *ptr++ - '0';
2983 /* Come here from code above that handles a named recursion */
2989 /* Find the bracket that is being referenced. Temporarily end the
2990 regex in case it doesn't exist. */
2993 called = (recno == 0)?
2994 cd->start_code : find_bracket(cd->start_code, utf8, recno);
2998 *errorcodeptr = ERR15;
3002 /* If the subpattern is still open, this is a recursive call. We
3003 check to see if this is a left recursion that could loop for ever,
3004 and diagnose that case. */
3006 if (GET(called, 1) == 0 && could_be_empty(called, code, bcptr, utf8))
3008 *errorcodeptr = ERR40;
3012 /* Insert the recursion/subroutine item */
3015 PUT(code, 1, called - cd->start_code);
3016 code += 1 + LINK_SIZE;
3020 /* Character after (? not specially recognized */
3022 default: /* Option setting */
3026 while (*ptr != ')' && *ptr != ':')
3030 case '-': optset = &unset; break;
3032 case 'i': *optset |= PCRE_CASELESS; break;
3033 case 'm': *optset |= PCRE_MULTILINE; break;
3034 case 's': *optset |= PCRE_DOTALL; break;
3035 case 'x': *optset |= PCRE_EXTENDED; break;
3036 case 'U': *optset |= PCRE_UNGREEDY; break;
3037 case 'X': *optset |= PCRE_EXTRA; break;
3041 /* Set up the changed option bits, but don't change anything yet. */
3043 newoptions = (options | set) & (~unset);
3045 /* If the options ended with ')' this is not the start of a nested
3046 group with option changes, so the options change at this level. Compile
3047 code to change the ims options if this setting actually changes any of
3048 them. We also pass the new setting back so that it can be put at the
3049 start of any following branches, and when this group ends (if we are in
3050 a group), a resetting item can be compiled.
3052 Note that if this item is right at the start of the pattern, the
3053 options will have been abstracted and made global, so there will be no
3054 change to compile. */
3058 if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
3061 *code++ = newoptions & PCRE_IMS;
3064 /* Change options at this level, and pass them back for use
3065 in subsequent branches. Reset the greedy defaults and the case
3066 value for firstbyte and reqbyte. */
3068 *optionsptr = options = newoptions;
3069 greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
3070 greedy_non_default = greedy_default ^ 1;
3071 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
3073 previous = NULL; /* This item can't be repeated */
3074 continue; /* It is complete */
3077 /* If the options ended with ':' we are heading into a nested group
3078 with possible change of options. Such groups are non-capturing and are
3079 not assertions of any kind. All we need to do is skip over the ':';
3080 the newoptions value is handled below. */
3087 /* If PCRE_NO_AUTO_CAPTURE is set, all unadorned brackets become
3088 non-capturing and behave like (?:...) brackets */
3090 else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
3095 /* Else we have a referencing group; adjust the opcode. If the bracket
3096 number is greater than EXTRACT_BASIC_MAX, we set the opcode one higher, and
3097 arrange for the true number to follow later, in an OP_BRANUMBER item. */
3102 if (++(*brackets) > EXTRACT_BASIC_MAX)
3104 bravalue = OP_BRA + EXTRACT_BASIC_MAX + 1;
3105 code[1+LINK_SIZE] = OP_BRANUMBER;
3106 PUT2(code, 2+LINK_SIZE, *brackets);
3109 else bravalue = OP_BRA + *brackets;
3112 /* Process nested bracketed re. Assertions may not be repeated, but other
3113 kinds can be. We copy code into a non-register variable in order to be able
3114 to pass its address because some compilers complain otherwise. Pass in a
3115 new setting for the ims options if they have changed. */
3117 previous = (bravalue >= OP_ONCE)? code : NULL;
3120 tempreqvary = cd->req_varyopt; /* Save value before bracket */
3123 newoptions, /* The complete new option state */
3124 options & PCRE_IMS, /* The previous ims option state */
3125 brackets, /* Extracting bracket count */
3126 &tempcode, /* Where to put code (updated) */
3127 &ptr, /* Input pointer (updated) */
3128 errorcodeptr, /* Where to put an error message */
3129 (bravalue == OP_ASSERTBACK ||
3130 bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
3131 skipbytes, /* Skip over OP_COND/OP_BRANUMBER */
3132 &subfirstbyte, /* For possible first char */
3133 &subreqbyte, /* For possible last char */
3134 bcptr, /* Current branch chain */
3135 cd)) /* Tables block */
3138 /* At the end of compiling, code is still pointing to the start of the
3139 group, while tempcode has been updated to point past the end of the group
3140 and any option resetting that may follow it. The pattern pointer (ptr)
3141 is on the bracket. */
3143 /* If this is a conditional bracket, check that there are no more than
3144 two branches in the group. */
3146 else if (bravalue == OP_COND)
3155 while (*tc != OP_KET);
3159 *errorcodeptr = ERR27;
3163 /* If there is just one branch, we must not make use of its firstbyte or
3164 reqbyte, because this is equivalent to an empty second branch. */
3166 if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
3169 /* Handle updating of the required and first characters. Update for normal
3170 brackets of all kinds, and conditions with two branches (see code above).
3171 If the bracket is followed by a quantifier with zero repeat, we have to
3172 back off. Hence the definition of zeroreqbyte and zerofirstbyte outside the
3173 main loop so that they can be accessed for the back off. */
3175 zeroreqbyte = reqbyte;
3176 zerofirstbyte = firstbyte;
3177 groupsetfirstbyte = FALSE;
3179 if (bravalue >= OP_BRA || bravalue == OP_ONCE || bravalue == OP_COND)
3181 /* If we have not yet set a firstbyte in this branch, take it from the
3182 subpattern, remembering that it was set here so that a repeat of more
3183 than one can replicate it as reqbyte if necessary. If the subpattern has
3184 no firstbyte, set "none" for the whole branch. In both cases, a zero
3185 repeat forces firstbyte to "none". */
3187 if (firstbyte == REQ_UNSET)
3189 if (subfirstbyte >= 0)
3191 firstbyte = subfirstbyte;
3192 groupsetfirstbyte = TRUE;
3194 else firstbyte = REQ_NONE;
3195 zerofirstbyte = REQ_NONE;
3198 /* If firstbyte was previously set, convert the subpattern's firstbyte
3199 into reqbyte if there wasn't one, using the vary flag that was in
3200 existence beforehand. */
3202 else if (subfirstbyte >= 0 && subreqbyte < 0)
3203 subreqbyte = subfirstbyte | tempreqvary;
3205 /* If the subpattern set a required byte (or set a first byte that isn't
3206 really the first byte - see above), set it. */
3208 if (subreqbyte >= 0) reqbyte = subreqbyte;
3211 /* For a forward assertion, we take the reqbyte, if set. This can be
3212 helpful if the pattern that follows the assertion doesn't set a different
3213 char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
3214 for an assertion, however, because it leads to incorrect effects for patterns
3215 such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
3216 of a firstbyte. This is overcome by a scan at the end if there's no
3217 firstbyte, looking for an asserted first char. */
3219 else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
3221 /* Now update the main code pointer to the end of the group. */
3225 /* Error if hit end of pattern */
3229 *errorcodeptr = ERR14;
3234 /* Check \ for being a real metacharacter; if not, fall through and handle
3235 it as a data character at the start of a string. Escape items are checked
3236 for validity in the pre-compiling pass. */
3240 c = check_escape(&ptr, errorcodeptr, *brackets, options, FALSE);
3242 /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
3243 are arranged to be the negation of the corresponding OP_values. For the
3244 back references, the values are ESC_REF plus the reference number. Only
3245 back references and those types that consume a character may be repeated.
3246 We can test for values between ESC_b and ESC_Z for the latter; this may
3247 have to change if any new ones are ever created. */
3251 if (-c == ESC_Q) /* Handle start of quoted string */
3253 if (ptr[1] == '\\' && ptr[2] == 'E') ptr += 2; /* avoid empty string */
3258 /* For metasequences that actually match a character, we disable the
3259 setting of a first character if it hasn't already been set. */
3261 if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
3262 firstbyte = REQ_NONE;
3264 /* Set values to reset to if this is followed by a zero repeat. */
3266 zerofirstbyte = firstbyte;
3267 zeroreqbyte = reqbyte;
3269 /* Back references are handled specially */
3273 int number = -c - ESC_REF;
3276 PUT2INC(code, 0, number);
3279 /* So are Unicode property matches, if supported. We know that get_ucp
3280 won't fail because it was tested in the pre-pass. */
3283 else if (-c == ESC_P || -c == ESC_p)
3286 int value = get_ucp(&ptr, &negated, errorcodeptr);
3288 *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
3293 /* For the rest, we can obtain the OP value by negating the escape
3298 previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
3304 /* We have a data character whose value is in c. In UTF-8 mode it may have
3305 a value > 127. We set its representation in the length/buffer, and then
3306 handle it as a data character. */
3309 if (utf8 && c > 127)
3310 mclength = _pcre_ord2utf8(c, mcbuffer);
3321 /* Handle a literal character. It is guaranteed not to be whitespace or #
3322 when the extended flag is set. If we are in UTF-8 mode, it may be a
3323 multi-byte literal character. */
3331 if (utf8 && (c & 0xc0) == 0xc0)
3333 while ((ptr[1] & 0xc0) == 0x80)
3334 mcbuffer[mclength++] = *(++ptr);
3338 /* At this point we have the character's bytes in mcbuffer, and the length
3339 in mclength. When not in UTF-8 mode, the length is always 1. */
3343 *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARNC : OP_CHAR;
3344 for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
3346 /* Set the first and required bytes appropriately. If no previous first
3347 byte, set it from this character, but revert to none on a zero repeat.
3348 Otherwise, leave the firstbyte value alone, and don't change it on a zero
3351 if (firstbyte == REQ_UNSET)
3353 zerofirstbyte = REQ_NONE;
3354 zeroreqbyte = reqbyte;
3356 /* If the character is more than one byte long, we can set firstbyte
3357 only if it is not to be matched caselessly. */
3359 if (mclength == 1 || req_caseopt == 0)
3361 firstbyte = mcbuffer[0] | req_caseopt;
3362 if (mclength != 1) reqbyte = code[-1] | cd->req_varyopt;
3364 else firstbyte = reqbyte = REQ_NONE;
3367 /* firstbyte was previously set; we can set reqbyte only if the length is
3368 1 or the matching is caseful. */
3372 zerofirstbyte = firstbyte;
3373 zeroreqbyte = reqbyte;
3374 if (mclength == 1 || req_caseopt == 0)
3375 reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
3378 break; /* End of literal character handling */
3380 } /* end of big loop */
3382 /* Control never reaches here by falling through, only by a goto for all the
3383 error states. Pass back the position in the pattern so that it can be displayed
3384 to the user for diagnosing the error. */
3394 /*************************************************
3395 * Compile sequence of alternatives *
3396 *************************************************/
3398 /* On entry, ptr is pointing past the bracket character, but on return
3399 it points to the closing bracket, or vertical bar, or end of string.
3400 The code variable is pointing at the byte into which the BRA operator has been
3401 stored. If the ims options are changed at the start (for a (?ims: group) or
3402 during any branch, we need to insert an OP_OPT item at the start of every
3403 following branch to ensure they get set correctly at run time, and also pass
3404 the new options into every subsequent branch compile.
3407 options option bits, including any changes for this subpattern
3408 oldims previous settings of ims option bits
3409 brackets -> int containing the number of extracting brackets used
3410 codeptr -> the address of the current code pointer
3411 ptrptr -> the address of the current pattern pointer
3412 errorcodeptr -> pointer to error code variable
3413 lookbehind TRUE if this is a lookbehind assertion
3414 skipbytes skip this many bytes at start (for OP_COND, OP_BRANUMBER)
3415 firstbyteptr place to put the first required character, or a negative number
3416 reqbyteptr place to put the last required character, or a negative number
3417 bcptr pointer to the chain of currently open branches
3418 cd points to the data block with tables pointers etc.
3420 Returns: TRUE on success
/* Compiles the alternatives of one bracketed group by calling compile_branch()
for each of them, then hands the combined firstbyte and reqbyte values back
through firstbyteptr and reqbyteptr. */
3424 compile_regex(int options, int oldims, int *brackets, uschar **codeptr,
3425 const uschar **ptrptr, int *errorcodeptr, BOOL lookbehind, int skipbytes,
3426 int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd)
/* Work on local copies of the caller's pattern and code pointers. last_branch
and start_bracket both begin at the group's opening item. */
3428 const uschar *ptr = *ptrptr;
3429 uschar *code = *codeptr;
3430 uschar *last_branch = code;
3431 uschar *start_bracket = code;
3432 uschar *reverse_count = NULL;
3433 int firstbyte, reqbyte;
3434 int branchfirstbyte, branchreqbyte;
3440 firstbyte = reqbyte = REQ_UNSET;
3442 /* Offset is set zero to mark that this bracket is still open */
/* Step over the opening item's opcode and link field, plus the skipbytes that
the caller asked to have left alone. */
3445 code += 1 + LINK_SIZE + skipbytes;
3447 /* Loop for each alternative branch */
3451 /* Handle a change of ims options at the start of the branch */
3453 if ((options & PCRE_IMS) != oldims)
/* The current ims option bits are written into the compiled code. */
3456 *code++ = options & PCRE_IMS;
3459 /* Set up dummy OP_REVERSE if lookbehind assertion */
3463 *code++ = OP_REVERSE;
/* Remember where the length operand lives; it is filled in by the
PUT(reverse_count, ...) further down. */
3464 reverse_count = code;
3468 /* Now compile the branch */
/* options is passed by address, so the branch compile can change it; the
branch's own firstbyte and reqbyte come back in branchfirstbyte and
branchreqbyte. */
3470 if (!compile_branch(&options, brackets, &code, &ptr, errorcodeptr,
3471 &branchfirstbyte, &branchreqbyte, &bc, cd))
3477 /* If this is the first branch, the firstbyte and reqbyte values for the
3478 branch become the values for the regex. */
/* last_branch is only moved when another alternative is started (see the end
of this function), so until then it does not point at an OP_ALT. */
3480 if (*last_branch != OP_ALT)
3482 firstbyte = branchfirstbyte;
3483 reqbyte = branchreqbyte;
3486 /* If this is not the first branch, the first char and reqbyte have to
3487 match the values from all the previous branches, except that if the previous
3488 value for reqbyte didn't have REQ_VARY set, it can still match, and we set
3489 REQ_VARY for the regex. */
3493 /* If we previously had a firstbyte, but it doesn't match the new branch,
3494 we have to abandon the firstbyte for the regex, but if there was previously
3495 no reqbyte, it takes on the value of the old firstbyte. */
3497 if (firstbyte >= 0 && firstbyte != branchfirstbyte)
3499 if (reqbyte < 0) reqbyte = firstbyte;
3500 firstbyte = REQ_NONE;
3503 /* If we (now or from before) have no firstbyte, a firstbyte from the
3504 branch becomes a reqbyte if there isn't a branch reqbyte. */
3506 if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
3507 branchreqbyte = branchfirstbyte;
3509 /* Now ensure that the reqbytes match */
/* The comparison ignores the REQ_VARY flag on both sides; when the values
agree, the flag from the branch is merged into reqbyte. */
3511 if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
3513 else reqbyte |= branchreqbyte; /* To "or" REQ_VARY */
3516 /* If lookbehind, check that this branch matches a fixed-length string,
3517 and put the length into the OP_REVERSE item. Temporarily mark the end of
3518 the branch with OP_END. */
3524 length = find_fixedlength(last_branch, options);
3525 DPRINTF(("fixed length = %d\n", length));
/* A length of -2 selects ERR36; otherwise ERR25 is used. */
3528 *errorcodeptr = (length == -2)? ERR36 : ERR25;
3532 PUT(reverse_count, 0, length);
3535 /* Reached end of expression, either ')' or end of pattern. Go back through
3536 the alternative branches and reverse the chain of offsets, with the field in
3537 the BRA item now becoming an offset to the first alternative. If there are
3538 no alternatives, it points to the end of the group. The length in the
3539 terminating ket is always the length of the whole bracketed item. If any of
3540 the ims options were changed inside the group, compile a resetting op-code
3541 following, except at the very end of the pattern. Return leaving the pointer
3542 at the terminating char. */
/* length starts as the distance from the last branch to the current end of the
code. Each step stores it in that branch's link field and then moves back by
the value the field held before, which was the distance to the branch in
front of it. */
3546 int length = code - last_branch;
3549 int prev_length = GET(last_branch, 1);
3550 PUT(last_branch, 1, length);
3551 length = prev_length;
3552 last_branch -= length;
3556 /* Fill in the ket */
/* The ket's link field holds the distance back to the start of the bracket. */
3559 PUT(code, 1, code - start_bracket);
3560 code += 1 + LINK_SIZE;
3562 /* Resetting option if needed */
3564 if ((options & PCRE_IMS) != oldims && *ptr == ')')
3570 /* Set values to pass back */
3574 *firstbyteptr = firstbyte;
3575 *reqbyteptr = reqbyte;
3579 /* Another branch follows; insert an "or" node. Its length field points back
3580 to the previous branch while the bracket remains open. At the end the chain
3581 is reversed. It's done like this so that the start of the bracket has a
3582 zero offset until it is closed, making it possible to detect recursion. */
/* The new node becomes both the current entry of the local branch chain (bc)
and the branch that the next backwards link will be measured from. */
3585 PUT(code, 1, code - last_branch);
3586 bc.current = last_branch = code;
3587 code += 1 + LINK_SIZE;
3590 /* Control never reaches here */
3596 /*************************************************
3597 * Check for anchored expression *
3598 *************************************************/
3600 /* Try to find out if this is an anchored regular expression. Consider each
3601 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
3602 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
3603 it's anchored. However, if this is a multiline pattern, then only OP_SOD
3604 counts, since OP_CIRC can match in the middle.
3606 We can also consider a regex to be anchored if OP_SOM starts all its branches.
3607 This is the code for \G, which means "match at start of match position, taking
3608 into account the match offset".
3610 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
3611 because that will try the rest of the pattern at all possible matching points,
3612 so there is no point trying again.... er ....
3614 .... except when the .* appears inside capturing parentheses, and there is a
3615 subsequent back reference to those parentheses. We haven't enough information
3616 to catch that case precisely.
3618 At first, the best we could do was to detect when .* was in capturing brackets
3619 and the highest back reference was greater than or equal to that level.
3620 However, by keeping a bitmap of the first 31 back references, we can catch some
3621 of the more common cases more precisely.
3624 code points to start of expression (the bracket)
3625 options points to the options setting
3626 bracket_map a bitmap of which brackets we are inside while testing; this
3627 handles up to substring 31; after that we just have to take
3628 the less precise approach
3629 backref_map the back reference bitmap
3631 Returns: TRUE or FALSE
/* Examines the alternatives of the group at "code" one after another. The
bracket and .* tests below return FALSE as soon as one alternative fails. */
3635 is_anchored(register const uschar *code, int *options, unsigned int bracket_map,
3636 unsigned int backref_map)
/* Skip this alternative's opcode and link field (1+LINK_SIZE bytes) and let
first_significant_code() choose the item to examine. */
3639 const uschar *scode =
3640 first_significant_code(code + 1+LINK_SIZE, options, PCRE_MULTILINE, FALSE);
3641 register int op = *scode;
3643 /* Capturing brackets */
/* A bracket number above EXTRACT_BASIC_MAX is read from the 2-byte value at
offset 2+LINK_SIZE rather than taken from the opcode. */
3649 if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE);
/* Add this bracket to the map for the recursive call: numbers below 32 get
their own bit, all higher numbers share bit 0.
NOTE(review): when op is 31, 1 << op shifts into the sign bit of a signed
int; 1u << op looks safer - confirm before changing. */
3650 new_map = bracket_map | ((op < 32)? (1 << op) : 1);
3651 if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
3654 /* Other brackets */
/* These groups are checked recursively with the bracket map unchanged. */
3656 else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
3658 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
3661 /* .* is not anchored unless DOTALL is set and it isn't in brackets that
3662 are or may be referenced. */
3664 else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR) &&
3665 (*options & PCRE_DOTALL) != 0)
/* The repeated type must be OP_ANY, and no bracket we are currently inside may
also appear in the back reference map. */
3667 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
3670 /* Check for explicit anchoring: the test is true when the item is neither
OP_SOD nor OP_SOM, and is not an OP_CIRC seen with PCRE_MULTILINE unset. */
3672 else if (op != OP_SOD && op != OP_SOM &&
3673 ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
/* Advance by the link value stored after the opcode to reach whatever follows
this alternative. */
3675 code += GET(code, 1);
3677 while (*code == OP_ALT); /* Loop for each alternative */
3683 /*************************************************
3684 * Check for starting with ^ or .* *
3685 *************************************************/
3687 /* This is called to find out if every branch starts with ^ or .* so that
3688 "first char" processing can be done to speed things up in multiline
3689 matching and for non-DOTALL patterns that start with .* (which must start at
3690 the beginning or after \n). As in the case of is_anchored() (see above), we
3691 have to take account of back references to capturing brackets that contain .*
3692 because in that case we can't make the assumption.
3695 code points to start of expression (the bracket)
3696 bracket_map a bitmap of which brackets we are inside while testing; this
3697 handles up to substring 31; after that we just have to take
3698 the less precise approach
3699 backref_map the back reference bitmap
3701 Returns: TRUE or FALSE
/* Examines the alternatives of the group at "code" one after another and
returns FALSE as soon as one of them fails the tests below. */
3705 is_startline(const uschar *code, unsigned int bracket_map,
3706 unsigned int backref_map)
/* Skip this alternative's opcode and link field (1+LINK_SIZE bytes) and let
first_significant_code() choose the item to examine. */
3709 const uschar *scode = first_significant_code(code + 1+LINK_SIZE, NULL, 0,
3711 register int op = *scode;
3713 /* Capturing brackets */
/* A bracket number above EXTRACT_BASIC_MAX is read from the 2-byte value at
offset 2+LINK_SIZE rather than taken from the opcode. */
3719 if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE);
/* Add this bracket to the map for the recursive call: numbers below 32 get
their own bit, all higher numbers share bit 0.
NOTE(review): when op is 31, 1 << op shifts into the sign bit of a signed
int; 1u << op looks safer - confirm before changing. */
3720 new_map = bracket_map | ((op < 32)? (1 << op) : 1);
3721 if (!is_startline(scode, new_map, backref_map)) return FALSE;
3724 /* Other brackets */
/* These groups are checked recursively with the bracket map unchanged. */
3726 else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
3727 { if (!is_startline(scode, bracket_map, backref_map)) return FALSE; }
3729 /* .* means "start at start or after \n" if it isn't in brackets that
3730 may be referenced. */
3732 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
/* The repeated type must be OP_ANY, and no bracket we are currently inside may
also appear in the back reference map. */
3734 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
3737 /* Check for explicit circumflex */
3739 else if (op != OP_CIRC) return FALSE;
3741 /* Move on to the next alternative */
3743 code += GET(code, 1);
3745 while (*code == OP_ALT); /* Loop for each alternative */
3751 /*************************************************
3752 * Check for asserted fixed first char *
3753 *************************************************/
3755 /* During compilation, the "first char" settings from forward assertions are
3756 discarded, because they can cause conflicts with actual literals that follow.
3757 However, if we end up without a first char setting for an unanchored pattern,
3758 it is worth scanning the regex to see if there is an initial asserted first
3759 char. If all branches start with the same asserted char, or with a bracket all
3760 of whose alternatives start with the same asserted char (recurse ad lib), then
3761 we return that char, otherwise -1.
3764 code points to start of expression (the bracket)
3765 options pointer to the options (used to check casing changes)
3766 inassert TRUE if in an assertion
3768 Returns: -1 or the fixed first char
/* Scan every alternative of the bracket at "code". c holds the character
recorded so far, or -1 while none has been recorded. The visible tests return
-1 as soon as an alternative cannot supply a matching character. */
3772 find_firstassertedchar(const uschar *code, int *options, BOOL inassert)
3774 register int c = -1;
3777 const uschar *scode =
3778 first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);
3779 register int op = *scode;
/* Every opcode at or above OP_BRA is treated as a plain OP_BRA from here on. */
3781 if (op >= OP_BRA) op = OP_BRA;
/* Recurse into the nested group. The callee is told it is inside an assertion
only when this group is an OP_ASSERT. A negative result is tested for here;
otherwise the first character found is recorded in c and each later one must
be equal to it. */
3792 if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)
3794 if (c < 0) c = d; else if (c != d) return -1;
3797 case OP_EXACT: /* Fall through */
/* The character items handled here count only when inside an assertion. */
3804 if (!inassert) return -1;
/* Flag the recorded character as caseless when PCRE_CASELESS is set in
the current options. */
3808 if ((*options & PCRE_CASELESS) != 0) c |= REQ_CASELESS;
/* A character is already recorded, so scode[1] has to be the same value. */
3810 else if (c != scode[1]) return -1;
/* Step to the next alternative, if there is one. */
3814 code += GET(code, 1);
3816 while (*code == OP_ALT);
3822 /*************************************************
3823 * Compile a Regular Expression *
3824 *************************************************/
3826 /* This function takes a string and returns a pointer to a block of store
3827 holding a compiled version of the expression. The original API for this
3828 function had no error code return variable; it is retained for backwards
3829 compatibility. The new function is given a new name.
3832 pattern the regular expression
3833 options various option bits
3834 errorcodeptr pointer to error code variable (pcre_compile2() only)
3835 can be NULL if you don't want a code value
3836 errorptr pointer to pointer to error text
3837 erroroffset ptr offset in pattern where error was detected
3838 tables pointer to character tables or NULL
3840 Returns: pointer to compiled data block, or NULL on error,
3841 with errorptr and erroroffset set
/* Original entry point, retained for backwards compatibility. It forwards all
of its arguments to pcre_compile2() and passes NULL as errorcodeptr, so no
numeric error code is handed back to the caller. */
3845 pcre_compile(const char *pattern, int options, const char **errorptr,
3846 int *erroroffset, const unsigned char *tables)
3848 return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
/* Compile "pattern" into a data block. As the code below shows, the work is
done in stages: (1) check the arguments and the option bits; (2) scan the
pattern once to compute "length", an estimate of the compiled size that must
never be too small; (3) get a block from pcre_malloc and fill in its header
fields; (4) call compile_regex() to generate the code; (5) on success, record
anchoring, first-byte, start-of-line and required-byte information in the
block. Failures jump to PCRE_ERROR_RETURN or PCRE_EARLY_ERROR_RETURN; at the
latter label the error text is stored through errorptr and, if errorcodeptr
is not NULL, the error number is stored through it as well. */
3853 pcre_compile2(const char *pattern, int options, int *errorcodeptr,
3854 const char **errorptr, int *erroroffset, const unsigned char *tables)
3857 int length = 1 + LINK_SIZE; /* For initial BRA plus length */
3858 int c, firstbyte, reqbyte;
3860 int branch_extra = 0;
3861 int branch_newextra;
3862 int item_count = -1;
3864 int max_name_size = 0;
3865 int lastitemlength = 0;
3871 BOOL inescq = FALSE;
3873 unsigned int brastackptr = 0;
3876 const uschar *codestart;
3878 compile_data compile_block;
3879 int brastack[BRASTACK_SIZE];
3880 uschar bralenstack[BRASTACK_SIZE];
3882 /* We can't pass back an error message if errorptr is NULL; I guess the best we
3883 can do is just return NULL, but we can set a code value if there is a code
3886 if (errorptr == NULL)
3888 if (errorcodeptr != NULL) *errorcodeptr = 99;
3893 if (errorcodeptr != NULL) *errorcodeptr = ERR0;
3895 /* However, we can give a message for this error */
3897 if (erroroffset == NULL)
3900 goto PCRE_EARLY_ERROR_RETURN;
3905 /* Can't support UTF8 unless PCRE has been compiled to include the code. */
3908 utf8 = (options & PCRE_UTF8) != 0;
3909 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
3910 (*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0)
3913 goto PCRE_EARLY_ERROR_RETURN;
3916 if ((options & PCRE_UTF8) != 0)
3919 goto PCRE_EARLY_ERROR_RETURN;
/* Any option bit that is not part of PUBLIC_OPTIONS is rejected. */
3923 if ((options & ~PUBLIC_OPTIONS) != 0)
3926 goto PCRE_EARLY_ERROR_RETURN;
3929 /* Set up pointers to the individual character tables */
3931 if (tables == NULL) tables = _pcre_default_tables;
3932 compile_block.lcc = tables + lcc_offset;
3933 compile_block.fcc = tables + fcc_offset;
3934 compile_block.cbits = tables + cbits_offset;
3935 compile_block.ctypes = tables + ctypes_offset;
3937 /* Maximum back reference and backref bitmap. This is updated for numeric
3938 references during the first pass, but for named references during the actual
3939 compile pass. The bitmap records up to 31 back references to help in deciding
3940 whether (.*) can be treated as anchored or not. */
3942 compile_block.top_backref = 0;
3943 compile_block.backref_map = 0;
3945 /* Reflect pattern for debugging output */
3947 DPRINTF(("------------------------------------------------------------------\n"));
3948 DPRINTF(("%s\n", pattern));
3950 /* The first thing to do is to make a pass over the pattern to compute the
3951 amount of store required to hold the compiled code. This does not have to be
3952 perfect as long as errors are overestimates. At the same time we can detect any
3953 flag settings right at the start, and extract them. Make an attempt to correct
3954 for any counted white space if an "extended" flag setting appears late in the
3955 pattern. We can't be so clever for #-comments. */
/* ptr starts one byte before the pattern because the loop condition
increments it before each character is read. */
3957 ptr = (const uschar *)(pattern - 1);
3958 while ((c = *(++ptr)) != 0)
3965 /* If we are inside a \Q...\E sequence, all chars are literal */
3969 if ((options & PCRE_AUTO_CALLOUT) != 0) length += 2 + 2*LINK_SIZE;
3973 /* Otherwise, first check for ignored whitespace and comments */
3975 if ((options & PCRE_EXTENDED) != 0)
3977 if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
3980 /* The space before the ; is to avoid a warning on a silly compiler
3981 on the Macintosh. */
3982 while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
3988 item_count++; /* Is zero for the first non-comment item */
3990 /* Allow space for auto callout before every item except quantifiers. */
3992 if ((options & PCRE_AUTO_CALLOUT) != 0 &&
3993 c != '*' && c != '+' && c != '?' &&
3994 (c != '{' || !is_counted_repeat(ptr + 1)))
3995 length += 2 + 2*LINK_SIZE;
3999 /* A backslashed item may be an escaped data character or it may be a
4003 c = check_escape(&ptr, &errorcode, bracount, options, FALSE);
4004 if (errorcode != 0) goto PCRE_ERROR_RETURN;
4006 lastitemlength = 1; /* Default length of last item for repeats */
4008 if (c >= 0) /* Data character */
4010 length += 2; /* For a one-byte character */
4013 if (utf8 && c > 127)
4016 for (i = 0; i < _pcre_utf8_table1_size; i++)
4017 if (c <= _pcre_utf8_table1[i]) break;
4019 lastitemlength += i;
4026 /* If \Q, enter "literal" mode */
4034 /* \X is supported only if Unicode property support is compiled */
4040 goto PCRE_ERROR_RETURN;
4044 /* \P and \p are for Unicode properties, but only when the support has
4045 been compiled. Each item needs 2 bytes. */
4047 else if (-c == ESC_P || -c == ESC_p)
4053 if (get_ucp(&ptr, &negated, &errorcode) < 0) goto PCRE_ERROR_RETURN;
4057 goto PCRE_ERROR_RETURN;
4061 /* Other escapes need one byte */
4065 /* A back reference needs an additional 2 bytes, plus either one or 5
4066 bytes for a repeat. We also need to keep the value of the highest
4071 int refnum = -c - ESC_REF;
4072 compile_block.backref_map |= (refnum < 32)? (1 << refnum) : 1;
4073 if (refnum > compile_block.top_backref)
4074 compile_block.top_backref = refnum;
4075 length += 2; /* For single back reference */
4076 if (ptr[1] == '{' && is_counted_repeat(ptr+2))
4078 ptr = read_repeat_counts(ptr+2, &min, &max, &errorcode);
4079 if (errorcode != 0) goto PCRE_ERROR_RETURN;
4080 if ((min == 0 && (max == 1 || max == -1)) ||
4081 (min == 1 && max == -1))
4084 if (ptr[1] == '?') ptr++;
4089 case '^': /* Single-byte metacharacters */
4096 case '*': /* These repeats won't be after brackets; */
4097 case '+': /* those are handled separately */
4100 goto POSESSIVE; /* A few lines below */
4102 /* This covers the cases of braced repeats after a single char, metachar,
4103 class, or back reference. */
4106 if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;
4107 ptr = read_repeat_counts(ptr+1, &min, &max, &errorcode);
4108 if (errorcode != 0) goto PCRE_ERROR_RETURN;
4110 /* These special cases just insert one extra opcode */
4112 if ((min == 0 && (max == 1 || max == -1)) ||
4113 (min == 1 && max == -1))
4116 /* These cases might insert additional copies of a preceding character. */
4122 length -= lastitemlength; /* Uncount the original char or metachar */
4123 if (min > 0) length += 3 + lastitemlength;
4125 length += lastitemlength + ((max > 0)? 3 : 1);
4128 if (ptr[1] == '?') ptr++; /* Needs no extra length */
4130 POSESSIVE: /* Test for possessive quantifier */
4134 length += 2 + 2*LINK_SIZE; /* Allow for atomic brackets */
4138 /* An alternation contains an offset to the next branch or ket. If any ims
4139 options changed in the previous branch(es), and/or if we are in a
4140 lookbehind assertion, extra space will be needed at the start of the
4141 branch. This is handled by branch_extra. */
4144 length += 1 + LINK_SIZE + branch_extra;
4147 /* A character class uses 33 characters provided that all the character
4148 values are less than 256. Otherwise, it uses a bit map for low valued
4149 characters, and individual items for others. Don't worry about character
4150 types that aren't allowed in classes - they'll get picked up during the
4151 compile. A character class that contains only one single-byte character
4152 uses 2 or 3 bytes, depending on whether it is negated or not. Notice this
4153 where we can. (In UTF-8 mode we can do this only for chars < 128.) */
4156 if (*(++ptr) == '^')
4158 class_optcount = 10; /* Greater than one */
4161 else class_optcount = 0;
4167 /* Written as a "do" so that an initial ']' is taken as data */
4171 /* Inside \Q...\E everything is literal except \E */
4175 if (*ptr != '\\' || ptr[1] != 'E') goto GET_ONE_CHARACTER;
4181 /* Outside \Q...\E, check for escapes */
4185 c = check_escape(&ptr, &errorcode, bracount, options, TRUE);
4186 if (errorcode != 0) goto PCRE_ERROR_RETURN;
4188 /* \b is backspace inside a class; \X is literal */
4190 if (-c == ESC_b) c = '\b';
4191 else if (-c == ESC_X) c = 'X';
4193 /* \Q enters quoting mode */
4195 else if (-c == ESC_Q)
4201 /* Handle escapes that turn into characters */
4203 if (c >= 0) goto NON_SPECIAL_CHARACTER;
4205 /* Escapes that are meta-things. The normal ones just affect the
4206 bit map, but Unicode properties require an XCLASS extended item. */
4210 class_optcount = 10; /* \d, \s etc; make sure > 1 */
4212 if (-c == ESC_p || -c == ESC_P)
4217 length += LINK_SIZE + 2;
4225 /* Check the syntax for POSIX stuff. The bits we actually handle are
4226 checked during the real compile phase. */
4228 else if (*ptr == '[' && check_posix_syntax(ptr, &ptr, &compile_block))
4231 class_optcount = 10; /* Make sure > 1 */
4234 /* Anything else increments the possible optimization count. We have to
4235 detect ranges here so that we can compute the number of extra ranges for
4236 caseless wide characters when UCP support is available. If there are wide
4237 characters, we are going to have to use an XCLASS, even for single
4250 GETCHARLEN(c, ptr, extra);
4258 /* Come here from handling \ above when it escapes to a char value */
4260 NON_SPECIAL_CHARACTER:
4266 uschar const *hyptr = ptr++;
4270 d = check_escape(&ptr, &errorcode, bracount, options, TRUE);
4271 if (errorcode != 0) goto PCRE_ERROR_RETURN;
4272 if (-d == ESC_b) d = '\b'; /* backspace */
4273 else if (-d == ESC_X) d = 'X'; /* literal X in a class */
4275 else if (ptr[1] != 0 && ptr[1] != ']')
4282 GETCHARLEN(d, ptr, extra);
4289 if (d < 0) ptr = hyptr; /* go back to hyphen as data */
4292 /* If d >= 0 we have a range. In UTF-8 mode, if the end is > 255, or >
4293 127 for caseless matching, we will need to use an XCLASS. */
4297 class_optcount = 10; /* Ensure > 1 */
4301 goto PCRE_ERROR_RETURN;
4305 if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
4308 if (!class_utf8) /* Allow for XCLASS overhead */
4311 length += LINK_SIZE + 2;
4315 /* If we have UCP support, find out how many extra ranges are
4316 needed to map the other case of characters within this range. We
4317 have to mimic the range optimization here, because extending the
4318 range upwards might push d over a boundary that makes it use
4319 another byte in the UTF-8 representation. */
4321 if ((options & PCRE_CASELESS) != 0)
4326 while (get_othercase_range(&cc, origd, &occ, &ocd))
4328 if (occ >= c && ocd <= d) continue; /* Skip embedded */
4330 if (occ < c && ocd >= c - 1) /* Extend the basic range */
4331 { /* if there is overlap, */
4332 c = occ; /* noting that if occ < c */
4333 continue; /* we can't have ocd > d */
4334 } /* because a subrange is */
4335 if (ocd > d && occ <= d + 1) /* always shorter than */
4336 { /* the basic range. */
4341 /* An extra item is needed */
4343 length += 1 + _pcre_ord2utf8(occ, buffer) +
4344 ((occ == ocd)? 0 : _pcre_ord2utf8(ocd, buffer));
4347 #endif /* SUPPORT_UCP */
4349 /* The length of the (possibly extended) range */
4351 length += 1 + _pcre_ord2utf8(c, buffer) + _pcre_ord2utf8(d, buffer);
4353 #endif /* SUPPORT_UTF8 */
4357 /* We have a single character. There is nothing to be done unless we
4358 are in UTF-8 mode. If the char is > 255, or 127 when caseless, we must
4359 allow for an XCL_SINGLE item, doubled for caselessness if there is UCP
4365 if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
4368 class_optcount = 10; /* Ensure > 1 */
4369 if (!class_utf8) /* Allow for XCLASS overhead */
4372 length += LINK_SIZE + 2;
4375 length += (((options & PCRE_CASELESS) != 0)? 2 : 1) *
4376 (1 + _pcre_ord2utf8(c, buffer));
4377 #else /* SUPPORT_UCP */
4378 length += 1 + _pcre_ord2utf8(c, buffer);
4379 #endif /* SUPPORT_UCP */
4381 #endif /* SUPPORT_UTF8 */
4385 while (*(++ptr) != 0 && (inescq || *ptr != ']')); /* Concludes "do" above */
4387 if (*ptr == 0) /* Missing terminating ']' */
4390 goto PCRE_ERROR_RETURN;
4393 /* We can optimize when there was only one optimizable character. Repeats
4394 for positive and negated single one-byte chars are handled by the general
4395 code. Here, we handle repeats for the class opcodes. */
4397 if (class_optcount == 1) length += 3; else
4401 /* A repeat needs either 1 or 5 bytes. If it is a possessive quantifier,
4402 we also need extra for wrapping the whole thing in a sub-pattern. */
4404 if (*ptr != 0 && ptr[1] == '{' && is_counted_repeat(ptr+2))
4406 ptr = read_repeat_counts(ptr+2, &min, &max, &errorcode);
4407 if (errorcode != 0) goto PCRE_ERROR_RETURN;
4408 if ((min == 0 && (max == 1 || max == -1)) ||
4409 (min == 1 && max == -1))
4415 length += 2 + 2*LINK_SIZE;
4417 else if (ptr[1] == '?') ptr++;
4422 /* Brackets may be genuine groups or special things */
4425 branch_newextra = 0;
4426 bracket_length = 1 + LINK_SIZE;
4429 /* Handle special forms of bracket, which all start (? */
4438 /* Skip over comments entirely */
4441 while (*ptr != 0 && *ptr != ')') ptr++;
4445 goto PCRE_ERROR_RETURN;
4449 /* Non-referencing groups and lookaheads just move the pointer on, and
4450 then behave like a non-special bracket, except that they don't increment
4451 the count of extracting brackets. Ditto for the "once only" bracket,
4452 which is in Perl from version 5.005. */
4461 /* (?R) specifies a recursive call to the regex, which is an extension
4462 to provide the facility which can be obtained by (?p{perl-code}) in
4463 Perl 5.6. In Perl 5.8 this has become (??{perl-code}).
4465 From PCRE 4.00, items such as (?3) specify subroutine-like "calls" to
4466 the appropriate numbered brackets. This includes both recursive and
4467 non-recursive calls. (?R) is now synonymous with (?0). */
4472 case '0': case '1': case '2': case '3': case '4':
4473 case '5': case '6': case '7': case '8': case '9':
4476 while ((digitab[*(++ptr)] & ctype_digit) != 0);
4480 goto PCRE_ERROR_RETURN;
4482 length += 1 + LINK_SIZE;
4484 /* If this item is quantified, it will get wrapped inside brackets so
4485 as to use the code for quantified brackets. We jump down and use the
4486 code that handles this for real brackets. */
4488 if (ptr[1] == '+' || ptr[1] == '*' || ptr[1] == '?' || ptr[1] == '{')
4490 length += 2 + 2 * LINK_SIZE; /* to make bracketed */
4491 duplength = 5 + 3 * LINK_SIZE;
4492 goto HANDLE_QUANTIFIED_BRACKETS;
4496 /* (?C) is an extension which provides "callout" - to provide a bit of
4497 the functionality of the Perl (?{...}) feature. An optional number may
4498 follow (default is zero). */
4502 while ((digitab[*(++ptr)] & ctype_digit) != 0);
4506 goto PCRE_ERROR_RETURN;
4508 length += 2 + 2*LINK_SIZE;
4511 /* Named subpatterns are an extension copied from Python */
4516 /* Handle the definition of a named subpattern */
4520 const uschar *p; /* Don't amalgamate; some compilers */
4521 p = ++ptr; /* grumble at autoincrement in declaration */
4522 while ((compile_block.ctypes[*ptr] & ctype_word) != 0) ptr++;
4526 goto PCRE_ERROR_RETURN;
4529 if (ptr - p > max_name_size) max_name_size = (ptr - p);
4530 capturing = TRUE; /* Named parentheses are always capturing */
4534 /* Handle back references and recursive calls to named subpatterns */
4536 if (*ptr == '=' || *ptr == '>')
4538 while ((compile_block.ctypes[*(++ptr)] & ctype_word) != 0);
4542 goto PCRE_ERROR_RETURN;
4547 /* Unknown character after (?P */
4550 goto PCRE_ERROR_RETURN;
4552 /* Lookbehinds are in Perl from version 5.005 */
4556 if (*ptr == '=' || *ptr == '!')
4558 branch_newextra = 1 + LINK_SIZE;
4559 length += 1 + LINK_SIZE; /* For the first branch */
4563 goto PCRE_ERROR_RETURN;
4565 /* Conditionals are in Perl from version 5.005. The bracket must either
4566 be followed by a number (for bracket reference) or by an assertion
4567 group, or (a PCRE extension) by 'R' for a recursion test. */
4570 if (ptr[3] == 'R' && ptr[4] == ')')
4575 else if ((digitab[ptr[3]] & ctype_digit) != 0)
4579 while ((digitab[*ptr] & ctype_digit) != 0) ptr++;
4583 goto PCRE_ERROR_RETURN;
4586 else /* An assertion must follow */
4588 ptr++; /* Can treat like ':' as far as spacing is concerned */
4589 if (ptr[2] != '?' ||
4590 (ptr[3] != '=' && ptr[3] != '!' && ptr[3] != '<') )
4592 ptr += 2; /* To get right offset in message */
4594 goto PCRE_ERROR_RETURN;
4599 /* Else loop checking valid options until ) is met. Anything else is an
4600 error. If we are without any brackets, i.e. at top level, the settings
4601 act as if specified in the options, so massage the options immediately.
4602 This is for backward compatibility with Perl 5.004. */
4615 *optset |= PCRE_CASELESS;
4619 *optset |= PCRE_MULTILINE;
4623 *optset |= PCRE_DOTALL;
4627 *optset |= PCRE_EXTENDED;
4631 *optset |= PCRE_EXTRA;
4635 *optset |= PCRE_UNGREEDY;
4642 /* A termination by ')' indicates an options-setting-only item; if
4643 this is at the very start of the pattern (indicated by item_count
4644 being zero), we use it to set the global options. This is helpful
4645 when analyzing the pattern for first characters, etc. Otherwise
4646 nothing is done here and it is handled during the compiling
4649 We allow for more than one options setting at the start. If such
4650 settings do not change the existing options, nothing is compiled.
4651 However, we must leave space just in case something is compiled.
4652 This can happen for pathological sequences such as (?i)(?-i)
4653 because the global options will end up with -i set. The space is
4654 small and not significant. (Before I did this there was a reported
4655 bug with (?i)(?-i) in a machine-generated pattern.)
4657 [Historical note: Up to Perl 5.8, options settings at top level
4658 were always global settings, wherever they appeared in the pattern.
4659 That is, they were equivalent to an external setting. From 5.8
4660 onwards, they apply only to what follows (which is what you might
4664 if (item_count == 0)
4666 options = (options | set) & (~unset);
4667 set = unset = 0; /* To save length */
4668 item_count--; /* To allow for several */
4674 /* A termination by ':' indicates the start of a nested group with
4675 the given options set. This is again handled at compile time, but
4676 we must allow for compiled space if any of the ims options are
4677 set. We also have to allow for resetting space at the end of
4678 the group, which is why 4 is added to the length and not just 2.
4679 If there are several changes of options within the same group, this
4680 will lead to an over-estimate on the length, but this shouldn't
4681 matter very much. We also have to allow for resetting options at
4682 the start of any alternations, which we do by setting
4683 branch_newextra to 2. Finally, we record whether the case-dependent
4684 flag ever changes within the regex. This is used by the "required
4688 if (((set|unset) & PCRE_IMS) != 0)
4691 branch_newextra = 2;
4692 if (((set|unset) & PCRE_CASELESS) != 0) options |= PCRE_ICHANGED;
4696 /* Unrecognized option character */
4700 goto PCRE_ERROR_RETURN;
4704 /* If we hit a closing bracket, that's it - this is a freestanding
4705 option-setting. We need to ensure that branch_extra is updated if
4706 necessary. The only values branch_newextra can have here are 0 or 2.
4707 If the value is 2, then branch_extra must either be 2 or 3+LINK_SIZE, depending
4708 on whether this is a lookbehind group or not. */
4713 if (branch_newextra == 2 &&
4714 (branch_extra == 0 || branch_extra == 1+LINK_SIZE))
4715 branch_extra += branch_newextra;
4719 /* If options were terminated by ':' control comes here. This is a
4720 non-capturing group with an options change. There is nothing more that
4721 needs to be done because "capturing" is already set FALSE by default;
4722 we can just fall through. */
4727 /* Ordinary parentheses, not followed by '?', are capturing unless
4728 PCRE_NO_AUTO_CAPTURE is set. */
4730 else capturing = (options & PCRE_NO_AUTO_CAPTURE) == 0;
4732 /* Capturing brackets must be counted so we can process escapes in a
4733 Perlish way. If the number exceeds EXTRACT_BASIC_MAX we are going to need
4734 an additional 3 bytes of memory per capturing bracket. */
4739 if (bracount > EXTRACT_BASIC_MAX) bracket_length += 3;
4742 /* Save length for computing whole length at end if there's a repeat that
4743 requires duplication of the group. Also save the current value of
4744 branch_extra, and start the new group with the new value. If non-zero, this
4745 will either be 2 for a (?imsx: group, or 1+LINK_SIZE for a lookbehind assertion. */
/* Nesting deeper than the brastack array can record is treated as an error. */
4747 if (brastackptr >= sizeof(brastack)/sizeof(int))
4750 goto PCRE_ERROR_RETURN;
4753 bralenstack[brastackptr] = branch_extra;
4754 branch_extra = branch_newextra;
4756 brastack[brastackptr++] = length;
4757 length += bracket_length;
4760 /* Handle ket. Look for subsequent max/min; for certain sets of values we
4761 have to replicate this bracket up to that many times. If brastackptr is
4762 0 this is an unmatched bracket which will generate an error, but take care
4763 not to try to access brastack[-1] when computing the length and restoring
4764 the branch_extra value. */
4767 length += 1 + LINK_SIZE;
4768 if (brastackptr > 0)
4770 duplength = length - brastack[--brastackptr];
4771 branch_extra = bralenstack[brastackptr];
4775 /* The following code is also used when a recursion such as (?3) is
4776 followed by a quantifier, because in that case, it has to be wrapped inside
4777 brackets so that the quantifier works. The value of duplength must be
4778 set before arrival. */
4780 HANDLE_QUANTIFIED_BRACKETS:
4782 /* Leave ptr at the final char; for read_repeat_counts this happens
4783 automatically; for the others we need an increment. */
4785 if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2))
4787 ptr = read_repeat_counts(ptr+2, &min, &max, &errorcode);
4788 if (errorcode != 0) goto PCRE_ERROR_RETURN;
4790 else if (c == '*') { min = 0; max = -1; ptr++; }
4791 else if (c == '+') { min = 1; max = -1; ptr++; }
4792 else if (c == '?') { min = 0; max = 1; ptr++; }
4793 else { min = 1; max = 1; }
4795 /* If the minimum is zero, we have to allow for an OP_BRAZERO before the
4796 group, and if the maximum is greater than zero, we have to replicate
4797 maxval-1 times; each replication acquires an OP_BRAZERO plus a nesting
4803 if (max > 0) length += (max - 1) * (duplength + 3 + 2*LINK_SIZE);
4806 /* When the minimum is greater than zero, we have to replicate up to
4807 minval-1 times, with no additions required in the copies. Then, if there
4808 is a limited maximum we have to replicate up to maxval-1 times allowing
4809 for a BRAZERO item before each optional copy and nesting brackets for all
4810 but one of the optional copies. */
4814 length += (min - 1) * duplength;
4815 if (max > min) /* Need this test as max=-1 means no limit */
4816 length += (max - min) * (duplength + 3 + 2*LINK_SIZE)
4817 - (2 + 2*LINK_SIZE);
4820 /* Allow space for once brackets for "possessive quantifier" */
4825 length += 2 + 2*LINK_SIZE;
4829 /* Non-special character. It won't be space or # in extended mode, so it is
4830 always a genuine character. If we are in a \Q...\E sequence, check for the
4831 end; if not, we have a literal. */
4836 if (inescq && c == '\\' && ptr[1] == 'E')
4843 length += 2; /* For a one-byte character */
4844 lastitemlength = 1; /* Default length of last item for repeats */
4846 /* In UTF-8 mode, check for additional bytes. */
4849 if (utf8 && (c & 0xc0) == 0xc0)
4851 while ((ptr[1] & 0xc0) == 0x80) /* Can't flow over the end */
4852 { /* because the end is marked */
4853 lastitemlength++; /* by a zero byte. */
4864 length += 2 + LINK_SIZE; /* For final KET and END */
4866 if ((options & PCRE_AUTO_CALLOUT) != 0)
4867 length += 2 + 2*LINK_SIZE; /* For final callout */
/* An estimate above MAX_PATTERN_SIZE is refused before any memory is
requested. */
4869 if (length > MAX_PATTERN_SIZE)
4872 goto PCRE_EARLY_ERROR_RETURN;
4875 /* Compute the size of data block needed and get it, either from malloc or
4876 externally provided function. */
/* The block holds the real_pcre header, a name table of name_count entries
of max_name_size + 3 bytes each, and the estimated code length. */
4878 size = length + sizeof(real_pcre) + name_count * (max_name_size + 3);
4879 re = (real_pcre *)(pcre_malloc)(size);
4884 goto PCRE_EARLY_ERROR_RETURN;
4887 /* Put in the magic number, and save the sizes, options, and character table
4888 pointer. NULL is used for the default character tables. The nullpad field is at
4889 the end; it's there to help in the case when a regex compiled on a system with
4890 4-byte pointers is run on another with 8-byte pointers. */
4892 re->magic_number = MAGIC_NUMBER;
4894 re->options = options;
4896 re->name_table_offset = sizeof(real_pcre);
4897 re->name_entry_size = max_name_size + 3;
4898 re->name_count = name_count;
4900 re->tables = (tables == _pcre_default_tables)? NULL : tables;
4903 /* The starting points of the name/number translation table and of the code are
4904 passed around in the compile data block. */
4906 compile_block.names_found = 0;
4907 compile_block.name_entry_size = max_name_size + 3;
4908 compile_block.name_table = (uschar *)re + re->name_table_offset;
4909 codestart = compile_block.name_table + re->name_entry_size * re->name_count;
4910 compile_block.start_code = codestart;
4911 compile_block.start_pattern = (const uschar *)pattern;
4912 compile_block.req_varyopt = 0;
4913 compile_block.nopartial = FALSE;
4915 /* Set up a starting, non-extracting bracket, then compile the expression. On
4916 error, errorcode will be set non-zero, so we don't need to look at the result
4917 of the function here. */
4919 ptr = (const uschar *)pattern;
4920 code = (uschar *)codestart;
4923 (void)compile_regex(options, options & PCRE_IMS, &bracount, &code, &ptr,
4924 &errorcode, FALSE, 0, &firstbyte, &reqbyte, NULL, &compile_block);
4925 re->top_bracket = bracount;
4926 re->top_backref = compile_block.top_backref;
4928 if (compile_block.nopartial) re->options |= PCRE_NOPARTIAL;
4930 /* If not reached end of pattern on success, there's an excess bracket. */
4932 if (errorcode == 0 && *ptr != 0) errorcode = ERR22;
4934 /* Fill in the terminating state and check for disastrous overflow, but
4935 if debugging, leave the test till after things are printed out. */
4940 if (code - codestart > length) errorcode = ERR23;
4943 /* Give an error if there's a back reference to a non-existent capturing
4946 if (re->top_backref > re->top_bracket) errorcode = ERR15;
4948 /* Failed to compile, or error while post-processing */
4954 *erroroffset = ptr - (const uschar *)pattern;
4955 PCRE_EARLY_ERROR_RETURN:
4956 *errorptr = error_texts[errorcode];
4957 if (errorcodeptr != NULL) *errorcodeptr = errorcode;
4961 /* If the anchored option was not passed, set the flag if we can determine that
4962 the pattern is anchored by virtue of ^ characters or \A or anything else (such
4963 as starting with .* when DOTALL is set).
4965 Otherwise, if we know what the first character has to be, save it, because that
4966 speeds up unanchored matches no end. If not, see if we can set the
4967 PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
4968 start with ^, and also when all branches start with .* for non-DOTALL matches.
4971 if ((options & PCRE_ANCHORED) == 0)
4973 int temp_options = options;
4974 if (is_anchored(codestart, &temp_options, 0, compile_block.backref_map))
4975 re->options |= PCRE_ANCHORED;
4979 firstbyte = find_firstassertedchar(codestart, &temp_options, FALSE);
4980 if (firstbyte >= 0) /* Remove caseless flag for non-caseable chars */
4982 int ch = firstbyte & 255;
4983 re->first_byte = ((firstbyte & REQ_CASELESS) != 0 &&
4984 compile_block.fcc[ch] == ch)? ch : firstbyte;
4985 re->options |= PCRE_FIRSTSET;
4987 else if (is_startline(codestart, 0, compile_block.backref_map))
4988 re->options |= PCRE_STARTLINE;
4992 /* For an anchored pattern, we use the "required byte" only if it follows a
4993 variable length item in the regex. Remove the caseless flag for non-caseable
4997 ((re->options & PCRE_ANCHORED) == 0 || (reqbyte & REQ_VARY) != 0))
4999 int ch = reqbyte & 255;
5000 re->req_byte = ((reqbyte & REQ_CASELESS) != 0 &&
5001 compile_block.fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte;
5002 re->options |= PCRE_REQCHSET;
5005 /* Print out the compiled data for debugging */
5009 printf("Length = %d top_bracket = %d top_backref = %d\n",
5010 length, re->top_bracket, re->top_backref);
5012 if (re->options != 0)
5014 printf("%s%s%s%s%s%s%s%s%s%s\n",
5015 ((re->options & PCRE_NOPARTIAL) != 0)? "nopartial " : "",
5016 ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
5017 ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
5018 ((re->options & PCRE_ICHANGED) != 0)? "case state changed " : "",
5019 ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
5020 ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
5021 ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
5022 ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
5023 ((re->options & PCRE_EXTRA) != 0)? "extra " : "",
5024 ((re->options & PCRE_UNGREEDY) != 0)? "ungreedy " : "");
5027 if ((re->options & PCRE_FIRSTSET) != 0)
5029 int ch = re->first_byte & 255;
5030 const char *caseless = ((re->first_byte & REQ_CASELESS) == 0)? "" : " (caseless)";
5031 if (isprint(ch)) printf("First char = %c%s\n", ch, caseless);
5032 else printf("First char = \\x%02x%s\n", ch, caseless);
5035 if ((re->options & PCRE_REQCHSET) != 0)
5037 int ch = re->req_byte & 255;
5038 const char *caseless = ((re->req_byte & REQ_CASELESS) == 0)? "" : " (caseless)";
5039 if (isprint(ch)) printf("Req char = %c%s\n", ch, caseless);
5040 else printf("Req char = \\x%02x%s\n", ch, caseless);
5043 _pcre_printint(re, stdout);
5045 /* This check is done here in the debugging case so that the code that
5046 was compiled can be seen. */
5048 if (code - codestart > length)
5051 *errorptr = error_texts[ERR23];
5052 *erroroffset = ptr - (uschar *)pattern;
5053 if (errorcodeptr != NULL) *errorcodeptr = ERR23;
5061 /* End of pcre_compile.c */