1 /* $Cambridge: exim/src/src/pcre/pcre_compile.c,v 1.4 2007/01/23 15:08:45 ph10 Exp $ */
3 /*************************************************
4 * Perl-Compatible Regular Expressions *
5 *************************************************/
7 /* PCRE is a library of functions to support regular expressions whose syntax
8 and semantics are as close as possible to those of the Perl 5 language.
10 Written by Philip Hazel
11 Copyright (c) 1997-2006 University of Cambridge
13 -----------------------------------------------------------------------------
14 Redistribution and use in source and binary forms, with or without
15 modification, are permitted provided that the following conditions are met:
17 * Redistributions of source code must retain the above copyright notice,
18 this list of conditions and the following disclaimer.
20 * Redistributions in binary form must reproduce the above copyright
21 notice, this list of conditions and the following disclaimer in the
22 documentation and/or other materials provided with the distribution.
24 * Neither the name of the University of Cambridge nor the names of its
25 contributors may be used to endorse or promote products derived from
26 this software without specific prior written permission.
28 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
29 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
30 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
31 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
32 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
33 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
34 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
35 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
36 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
37 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38 POSSIBILITY OF SUCH DAMAGE.
39 -----------------------------------------------------------------------------
43 /* This module contains the external function pcre_compile(), along with
44 supporting internal functions that are not used by other modules. */
47 #define NLBLOCK cd /* Block containing newline information */
48 #define PSSTART start_pattern /* Field containing processed string start */
49 #define PSEND end_pattern /* Field containing processed string end */
52 #include "pcre_internal.h"
55 /* When DEBUG is defined, we need the pcre_printint() function, which is also
56 used by pcretest. DEBUG is not defined when building a production library. */
59 #include "pcre_printint.src"
63 /*************************************************
64 * Code parameters and static tables *
65 *************************************************/
67 /* This value specifies the size of stack workspace that is used during the
68 first pre-compile phase that determines how much memory is required. The regex
69 is partly compiled into this space, but the compiled parts are discarded as
70 soon as they can be, so that hopefully there will never be an overrun. The code
71 does, however, check for an overrun. The largest amount I've seen used is 218,
72 so this number is very generous.
74 The same workspace is used during the second, actual compile phase for
75 remembering forward references to groups so that they can be filled in at the
76 end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
77 is 4 there is plenty of room. */
79 #define COMPILE_WORK_SIZE (4096)
82 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
83 are simple data values; negative values are for special things like \d and so
84 on. Zero means further processing is needed (for things like \x), or the escape
85 is invalid. */
87 #if !EBCDIC /* This is the "normal" table for ASCII systems */
88 static const short int escapes[] = {
89 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
90 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
91 '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */
92 0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
93 -ESC_P, -ESC_Q, -ESC_R, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
94 -ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
95 '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */
96 0, 0, 0, -ESC_k, 0, 0, ESC_n, 0, /* h - o */
97 -ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, 0, -ESC_w, /* p - w */
98 0, 0, -ESC_z /* x - z */
101 #else /* This is the "abnormal" table for EBCDIC systems */
102 static const short int escapes[] = {
103 /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
104 /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
105 /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
106 /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
107 /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
108 /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
109 /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
110 /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
111 /* 88 */ 0, 0, 0, '{', 0, 0, 0, 0,
112 /* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
113 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
114 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0, 0, -ESC_w, 0,
115 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
116 /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
117 /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
118 /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
119 /* C8 */ 0, 0, 0, 0, 0, 0, 0, 0,
120 /* D0 */ '}', 0, 0, 0, 0, 0, 0, -ESC_P,
121 /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
122 /* E0 */ '\\', 0, -ESC_S, 0, 0, 0, -ESC_W, -ESC_X,
123 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
124 /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
125 /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
130 /* Tables of names of POSIX character classes and their lengths. The list is
131 terminated by a zero length entry. The first three must be alpha, lower, upper,
132 as this is assumed for handling case independence. */
134 static const char *const posix_names[] = {
135 "alpha", "lower", "upper",
136 "alnum", "ascii", "blank", "cntrl", "digit", "graph",
137 "print", "punct", "space", "word", "xdigit" };
139 static const uschar posix_name_lengths[] = {
140 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
142 /* Table of class bit maps for each POSIX class. Each class is formed from a
143 base map, with an optional addition or removal of another map. Then, for some
144 classes, there is some additional tweaking: for [:blank:] the vertical space
145 characters are removed, and for [:alpha:] and [:alnum:] the underscore
146 character is removed. The triples in the table consist of the base map offset,
147 second map offset or -1 if no second map, and a non-negative value for map
148 addition or a negative value for map subtraction (if there are two maps). The
149 absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
150 remove vertical space characters, 2 => remove underscore. */
152 static const int posix_class_maps[] = {
153 cbit_word, cbit_digit, -2, /* alpha */
154 cbit_lower, -1, 0, /* lower */
155 cbit_upper, -1, 0, /* upper */
156 cbit_word, -1, 2, /* alnum - word without underscore */
157 cbit_print, cbit_cntrl, 0, /* ascii */
158 cbit_space, -1, 1, /* blank - a GNU extension */
159 cbit_cntrl, -1, 0, /* cntrl */
160 cbit_digit, -1, 0, /* digit */
161 cbit_graph, -1, 0, /* graph */
162 cbit_print, -1, 0, /* print */
163 cbit_punct, -1, 0, /* punct */
164 cbit_space, -1, 0, /* space */
165 cbit_word, -1, 0, /* word - a Perl extension */
166 cbit_xdigit,-1, 0 /* xdigit */
170 #define STRING(a) # a
171 #define XSTRING(s) STRING(s)
173 /* The texts of compile-time error messages. These are "char *" because they
174 are passed to the outside world. Do not ever re-use any error number, because
175 they are documented. Always add a new error instead. Messages marked DEAD below
176 are no longer used. */
178 static const char *error_texts[] = {
180 "\\ at end of pattern",
181 "\\c at end of pattern",
182 "unrecognized character follows \\",
183 "numbers out of order in {} quantifier",
185 "number too big in {} quantifier",
186 "missing terminating ] for character class",
187 "invalid escape sequence in character class",
188 "range out of order in character class",
191 "operand of unlimited repeat could match the empty string", /** DEAD **/
192 "internal error: unexpected repeat",
193 "unrecognized character after (?",
194 "POSIX named classes are supported only within a class",
197 "reference to non-existent subpattern",
198 "erroffset passed as NULL",
199 "unknown option bit(s) set",
200 "missing ) after comment",
201 "parentheses nested too deeply", /** DEAD **/
203 "regular expression too large",
204 "failed to get memory",
205 "unmatched parentheses",
206 "internal error: code overflow",
207 "unrecognized character after (?<",
209 "lookbehind assertion is not fixed length",
210 "malformed number or name after (?(",
211 "conditional group contains more than two branches",
212 "assertion expected after (?(",
213 "(?R or (?digits must be followed by )",
215 "unknown POSIX class name",
216 "POSIX collating elements are not supported",
217 "this version of PCRE is not compiled with PCRE_UTF8 support",
218 "spare error", /** DEAD **/
219 "character value in \\x{...} sequence is too large",
221 "invalid condition (?(0)",
222 "\\C not allowed in lookbehind assertion",
223 "PCRE does not support \\L, \\l, \\N, \\U, or \\u",
224 "number after (?C is > 255",
225 "closing ) for (?C expected",
227 "recursive call could loop indefinitely",
228 "unrecognized character after (?P",
229 "syntax error in subpattern name (missing terminator)",
230 "two named subpatterns have the same name",
231 "invalid UTF-8 string",
233 "support for \\P, \\p, and \\X has not been compiled",
234 "malformed \\P or \\p sequence",
235 "unknown property name after \\P or \\p",
236 "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)",
237 "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")",
239 "repeated subpattern is too long",
240 "octal value is greater than \\377 (not in UTF-8 mode)",
241 "internal error: overran compiling workspace",
242 "internal error: previously-checked referenced subpattern not found",
243 "DEFINE group contains more than one branch",
245 "repeating a DEFINE group is not allowed",
246 "inconsistent NEWLINE options",
247 "\\g is not followed by an (optionally braced) non-zero number"
251 /* Table to identify digits and hex digits. This is used when compiling
252 patterns. Note that the tables in chartables are dependent on the locale, and
253 may mark arbitrary characters as digits - but the PCRE compiling code expects
254 to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
255 a private table here. It costs 256 bytes, but it is a lot faster than doing
256 character value tests (at least in some simple cases I timed), and in some
257 applications one wants PCRE to compile efficiently as well as match
258 efficiently.
260 For convenience, we use the same bit definitions as in chartables:
262 0x04 decimal digit
263 0x08 hexadecimal digit
265 Then we can use ctype_digit and ctype_xdigit in the code. */
267 #if !EBCDIC /* This is the "normal" case, for ASCII systems */
268 static const unsigned char digitab[] =
270 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
271 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
272 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
273 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
274 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */
275 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */
276 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */
277 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */
278 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */
279 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */
280 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */
281 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */
282 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */
283 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */
284 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */
285 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */
286 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
287 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
288 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
289 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
290 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
291 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
292 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
293 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
294 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
295 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
296 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
297 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
298 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
299 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
300 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
301 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
303 #else /* This is the "abnormal" case, for EBCDIC systems */
304 static const unsigned char digitab[] =
306 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */
307 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
308 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */
309 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
310 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */
311 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
312 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */
313 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
314 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */
315 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */
316 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */
317 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- ¬ */
318 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */
319 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */
320 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
321 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
322 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 */
323 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
324 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */
325 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
326 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */
327 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
328 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */
329 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
330 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 */
331 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
332 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */
333 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
334 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */
335 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
336 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */
337 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
339 static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
340 0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */
341 0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */
342 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */
343 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
344 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 32- 39 */
345 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
346 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 */
347 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
348 0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 */
349 0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */
350 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */
351 0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- ¬ */
352 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */
353 0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */
354 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
355 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
356 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g */
357 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
358 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p */
359 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
360 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x */
361 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
362 0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 */
363 0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
364 0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* { - G */
365 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
366 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* } - P */
367 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
368 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* \ - X */
369 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
370 0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */
371 0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
375 /* Definition to allow mutual recursion */
378 compile_regex(int, int, uschar **, const uschar **, int *, BOOL, int, int *,
379 int *, branch_chain *, compile_data *, int *);
383 /*************************************************
385 *************************************************/
387 /* This function is called when a \ has been encountered. It either returns a
388 positive value for a simple escape such as \n, or a negative value which
389 encodes one of the more complicated things such as \d. A backreference to group
390 n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
391 UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
392 ptr is pointing at the \. On exit, it is on the final character of the escape
393 sequence.
396 ptrptr points to the pattern position pointer
397 errorcodeptr points to the errorcode variable
398 bracount number of previous extracting brackets
399 options the options bits
400 isclass TRUE if inside a character class
402 Returns: zero or positive => a data character
403 negative => a special escape sequence
404 on error, *errorcodeptr is set
408 check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
409 int options, BOOL isclass)
411 BOOL utf8 = (options & PCRE_UTF8) != 0;
412 const uschar *ptr = *ptrptr + 1;
415 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
416 ptr--; /* Set pointer back to the last byte */
418 /* If backslash is at the end of the pattern, it's an error. */
420 if (c == 0) *errorcodeptr = ERR1;
422 /* Non-alphamerics are literals. For digits or letters, do an initial lookup in
423 a table. A non-zero result is something that can be returned immediately.
424 Otherwise further processing may be required. */
426 #if !EBCDIC /* ASCII coding */
427 else if (c < '0' || c > 'z') {} /* Not alphameric */
428 else if ((i = escapes[c - '0']) != 0) c = i;
430 #else /* EBCDIC coding */
431 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphameric */
432 else if ((i = escapes[c - 0x48]) != 0) c = i;
435 /* Escapes that need further processing, or are illegal. */
439 const uschar *oldptr;
440 BOOL braced, negated;
444 /* A number of Perl escapes are not handled by PCRE. We give an explicit
445 error. */
452 *errorcodeptr = ERR37;
455 /* \g must be followed by a number, either plain or braced. If positive, it
456 is an absolute backreference. If negative, it is a relative backreference.
457 This is a Perl 5.10 feature. */
472 else negated = FALSE;
475 while ((digitab[ptr[1]] & ctype_digit) != 0)
476 c = c * 10 + *(++ptr) - '0';
478 if (c == 0 || (braced && *(++ptr) != '}'))
480 *errorcodeptr = ERR57;
488 *errorcodeptr = ERR15;
491 c = bracount - (c - 1);
497 /* The handling of escape sequences consisting of a string of digits
498 starting with one that is not zero is not straightforward. By experiment,
499 the way Perl works seems to be as follows:
501 Outside a character class, the digits are read as a decimal number. If the
502 number is less than 10, or if there are that many previous extracting
503 left brackets, then it is a back reference. Otherwise, up to three octal
504 digits are read to form an escaped byte. Thus \123 is likely to be octal
505 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
506 value is greater than 377, the least significant 8 bits are taken. Inside a
507 character class, \ followed by a digit is always an octal number. */
509 case '1': case '2': case '3': case '4': case '5':
510 case '6': case '7': case '8': case '9':
516 while ((digitab[ptr[1]] & ctype_digit) != 0)
517 c = c * 10 + *(++ptr) - '0';
518 if (c < 10 || c <= bracount)
523 ptr = oldptr; /* Put the pointer back and fall through */
526 /* Handle an octal number following \. If the first digit is 8 or 9, Perl
527 generates a binary zero byte and treats the digit as a following literal.
528 Thus we have to pull back the pointer by one. */
530 if ((c = *ptr) >= '8')
537 /* \0 always starts an octal number, but we may drop through to here with a
538 larger first octal digit. The original code used just to take the least
539 significant 8 bits of octal numbers (I think this is what early Perls used
540 to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
541 than 3 octal digits. */
545 while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
546 c = c * 8 + *(++ptr) - '0';
547 if (!utf8 && c > 255) *errorcodeptr = ERR51;
550 /* \x is complicated. \x{ddd} is a character number which can be greater
551 than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
552 treated as a data character. */
557 const uschar *pt = ptr + 2;
561 while ((digitab[*pt] & ctype_xdigit) != 0)
563 register int cc = *pt++;
564 if (c == 0 && cc == '0') continue; /* Leading zeroes */
567 #if !EBCDIC /* ASCII coding */
568 if (cc >= 'a') cc -= 32; /* Convert to upper case */
569 c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));
570 #else /* EBCDIC coding */
571 if (cc >= 'a' && cc <= 'z') cc += 64; /* Convert to upper case */
572 c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));
578 if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
583 /* If the sequence of hex digits does not end with '}', then we don't
584 recognize this construct; fall through to the normal \x handling. */
587 /* Read just a single-byte hex-defined char */
590 while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
592 int cc; /* Some compilers don't like ++ */
593 cc = *(++ptr); /* in initializers */
594 #if !EBCDIC /* ASCII coding */
595 if (cc >= 'a') cc -= 32; /* Convert to upper case */
596 c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
597 #else /* EBCDIC coding */
598 if (cc <= 'z') cc += 64; /* Convert to upper case */
599 c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
604 /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
605 This coding is ASCII-specific, but then the whole concept of \cx is
606 ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
612 *errorcodeptr = ERR2;
616 #if !EBCDIC /* ASCII coding */
617 if (c >= 'a' && c <= 'z') c -= 32;
619 #else /* EBCDIC coding */
620 if (c >= 'a' && c <= 'z') c += 64;
625 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
626 other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
627 for Perl compatibility, it is a literal. This code looks a bit odd, but
628 there used to be some cases other than the default, and there may be again
629 in future, so I haven't "optimized" it. */
632 if ((options & PCRE_EXTRA) != 0) switch(c)
635 *errorcodeptr = ERR3;
649 /*************************************************
651 *************************************************/
653 /* This function is called after \P or \p has been encountered, provided that
654 PCRE is compiled with support for Unicode properties. On entry, ptrptr is
655 pointing at the P or p. On exit, it is pointing at the final character of the
656 escape sequence.
659 ptrptr points to the pattern position pointer
660 negptr points to a boolean that is set TRUE for negation else FALSE
661 dptr points to an int that is set to the detailed property value
662 errorcodeptr points to the error code variable
664 Returns: type value from ucp_type_table, or -1 for an invalid type
668 get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
671 const uschar *ptr = *ptrptr;
675 if (c == 0) goto ERROR_RETURN;
679 /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
680 negation. */
689 for (i = 0; i < sizeof(name) - 1; i++)
692 if (c == 0) goto ERROR_RETURN;
696 if (c !='}') goto ERROR_RETURN;
700 /* Otherwise there is just one following character */
710 /* Search for a recognized property name using binary chop */
713 top = _pcre_utt_size;
717 i = (bot + top) >> 1;
718 c = strcmp(name, _pcre_utt[i].name);
721 *dptr = _pcre_utt[i].value;
722 return _pcre_utt[i].type;
724 if (c > 0) bot = i + 1; else top = i;
727 *errorcodeptr = ERR47;
732 *errorcodeptr = ERR46;
741 /*************************************************
742 * Check for counted repeat *
743 *************************************************/
745 /* This function is called when a '{' is encountered in a place where it might
746 start a quantifier. It looks ahead to see if it really is a quantifier or not.
747 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
748 where the ddds are digits.
751 p pointer to the first char after '{'
753 Returns: TRUE or FALSE
757 is_counted_repeat(const uschar *p)
759 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
760 while ((digitab[*p] & ctype_digit) != 0) p++;
761 if (*p == '}') return TRUE;
763 if (*p++ != ',') return FALSE;
764 if (*p == '}') return TRUE;
766 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
767 while ((digitab[*p] & ctype_digit) != 0) p++;
774 /*************************************************
775 * Read repeat counts *
776 *************************************************/
778 /* Read an item of the form {n,m} and return the values. This is called only
779 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
780 so the syntax is guaranteed to be correct, but we need to check the values.
783 p pointer to first char after '{'
784 minp pointer to int for min
785 maxp pointer to int for max
786 returned as -1 if no max
787 errorcodeptr points to error code variable
789 Returns: pointer to '}' on success;
790 current ptr on error, with errorcodeptr set non-zero
793 static const uschar *
794 read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
799 /* Read the minimum value and do a paranoid check: a negative value indicates
800 an integer overflow. */
802 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
803 if (min < 0 || min > 65535)
805 *errorcodeptr = ERR5;
809 /* Read the maximum value if there is one, and again do a paranoid check on its size.
810 Also, max must not be less than min. */
812 if (*p == '}') max = min; else
817 while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
818 if (max < 0 || max > 65535)
820 *errorcodeptr = ERR5;
825 *errorcodeptr = ERR4;
831 /* Fill in the required variables, and pass back the pointer to the terminating
832 '}'. */
841 /*************************************************
842 * Find forward referenced subpattern *
843 *************************************************/
845 /* This function scans along a pattern's text looking for capturing
846 subpatterns, and counting them. If it finds a named pattern that matches the
847 name it is given, it returns its number. Alternatively, if the name is NULL, it
848 returns when it reaches a given numbered subpattern. This is used for forward
849 references to subpatterns. We know that if (?P< is encountered, the name will
850 be terminated by '>' because that is checked in the first pass.
853 ptr current position in the pattern
854 count current count of capturing parens so far encountered
855 name name to seek, or NULL if seeking a numbered subpattern
856 lorn name length, or subpattern number if name is NULL
857 xmode TRUE if we are in /x mode
859 Returns: the number of the named subpattern, or -1 if not found
863 find_parens(const uschar *ptr, int count, const uschar *name, int lorn,
866 const uschar *thisname;
868 for (; *ptr != 0; ptr++)
872 /* Skip over backslashed characters and also entire \Q...\E */
876 if (*(++ptr) == 0) return -1;
877 if (*ptr == 'Q') for (;;)
879 while (*(++ptr) != 0 && *ptr != '\\');
880 if (*ptr == 0) return -1;
881 if (*(++ptr) == 'E') break;
886 /* Skip over character classes */
890 while (*(++ptr) != ']')
894 if (*(++ptr) == 0) return -1;
895 if (*ptr == 'Q') for (;;)
897 while (*(++ptr) != 0 && *ptr != '\\');
898 if (*ptr == 0) return -1;
899 if (*(++ptr) == 'E') break;
907 /* Skip comments in /x mode */
909 if (xmode && *ptr == '#')
911 while (*(++ptr) != 0 && *ptr != '\n');
912 if (*ptr == 0) return -1;
916 /* An opening parens must now be a real metacharacter */
918 if (*ptr != '(') continue;
922 if (name == NULL && count == lorn) return count;
927 if (*ptr == 'P') ptr++; /* Allow optional P */
929 /* We have to disambiguate (?<! and (?<= from (?<name> */
931 if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') &&
937 if (name == NULL && count == lorn) return count;
939 if (term == '<') term = '>';
941 while (*ptr != term) ptr++;
942 if (name != NULL && lorn == ptr - thisname &&
943 strncmp((const char *)name, (const char *)thisname, lorn) == 0)
952 /*************************************************
953 * Find first significant op code *
954 *************************************************/
956 /* This is called by several functions that scan a compiled expression looking
957 for a fixed first character, or an anchoring op code etc. It skips over things
958 that do not influence this. For some calls, a change of option is important.
959 For some calls, it makes sense to skip negative forward and all backward
960 assertions, and also the \b assertion; for others it does not.
963 code pointer to the start of the group
964 options pointer to external options
965 optbit the option bit whose changing is significant, or
966 zero if none are
967 skipassert TRUE if certain assertions are to be skipped
969 Returns: pointer to the first significant opcode
973 first_significant_code(const uschar *code, int *options, int optbit,
981 if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
982 *options = (int)code[1];
988 case OP_ASSERTBACK_NOT:
989 if (!skipassert) return code;
990 do code += GET(code, 1); while (*code == OP_ALT);
991 code += _pcre_OP_lengths[*code];
994 case OP_WORD_BOUNDARY:
995 case OP_NOT_WORD_BOUNDARY:
996 if (!skipassert) return code;
1003 code += _pcre_OP_lengths[*code];
1010 /* Control never reaches here */
1016 /*************************************************
1017 * Find the fixed length of a pattern *
1018 *************************************************/
1020 /* Scan a pattern and compute the fixed length of subject that will match it,
1021 if the length is fixed. This is needed for dealing with backward assertions.
1022 In UTF8 mode, the result is in characters rather than bytes.
1025 code points to the start of the pattern (the bracket)
1026 options the compiling options
1028 Returns: the fixed length, or -1 if there is no fixed length,
1029 or -2 if \C was encountered
1033 find_fixedlength(uschar *code, int options)
1037 register int branchlength = 0;
/* Scanning starts just past the opcode byte and the LINK_SIZE field that
follows it. */
1038 register uschar *cc = code + 1 + LINK_SIZE;
1040 /* Scan along the opcodes for this branch. If we get to the end of the
1041 branch, check the length against that of the other branches. */
1046 register int op = *cc;
/* The function calls itself here, starting 2 bytes further on when op is
OP_CBRA. A negative result (-1 or -2) is handed straight back to the caller. */
1054 d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
1055 if (d < 0) return d;
1057 do cc += GET(cc, 1); while (*cc == OP_ALT);
1058 cc += 1 + LINK_SIZE;
1061 /* Reached end of a branch; if it's a ket it is the end of a nested
1062 call. If it's ALT it is an alternation in a nested call. If it is
1063 END it's the end of the outer call. All can be handled by the same code. */
/* The first branch to finish sets length; any later branch whose length
differs makes the whole result -1. */
1070 if (length < 0) length = branchlength;
1071 else if (length != branchlength) return -1;
1072 if (*cc != OP_ALT) return length;
1073 cc += 1 + LINK_SIZE;
1077 /* Skip over assertive subpatterns */
1082 case OP_ASSERTBACK_NOT:
1083 do cc += GET(cc, 1); while (*cc == OP_ALT);
1086 /* Skip over things that don't match chars */
1100 case OP_NOT_WORD_BOUNDARY:
1101 case OP_WORD_BOUNDARY:
1102 cc += _pcre_OP_lengths[*cc];
1105 /* Handle literal characters */
1113 if ((options & PCRE_UTF8) != 0)
1115 while ((*cc & 0xc0) == 0x80) cc++;
1120 /* Handle exact repetitions. The count is already in characters, but we
1121 need to skip over a multibyte character in UTF8 mode. */
1124 branchlength += GET2(cc,1);
1127 if ((options & PCRE_UTF8) != 0)
/* NOTE(review): this loop tests (*cc & 0x80) == 0x80, whereas the loop for
literal characters above masks with 0xc0 so that only bytes of the form
10xxxxxx are stepped over. The test here also accepts bytes of 0xc0 and above;
confirm that the wider test is intended. */
1129 while((*cc & 0x80) == 0x80) cc++;
1135 branchlength += GET2(cc,1);
1139 /* Handle single-char matchers */
1148 case OP_NOT_WHITESPACE:
1150 case OP_NOT_WORDCHAR:
1157 /* The single-byte matcher isn't allowed */
1162 /* Check a class for variable quantification */
/* NOTE(review): the constant 33 is presumably one opcode byte plus a 32-byte
bit map; confirm against the layout of the class opcodes. */
1166 cc += GET(cc, 1) - 33;
/* A counted repeat contributes a fixed length only when the two values read
by GET2 at offsets 1 and 3 are equal; the first of them is then added. */
1184 if (GET2(cc,1) != GET2(cc,3)) return -1;
1185 branchlength += GET2(cc,1);
1194 /* Anything else is variable length */
1200 /* Control never gets here */
1206 /*************************************************
1207 * Scan compiled regex for numbered bracket *
1208 *************************************************/
1210 /* This little function scans through a compiled pattern until it finds a
1211 capturing bracket with the given number.
1214 code points to start of expression
1215 utf8 TRUE in UTF-8 mode
1216 number the required bracket number
1218 Returns: pointer to the opcode for the bracket, or NULL if not found
1221 static const uschar *
1222 find_bracket(const uschar *code, BOOL utf8, int number)
1226 register int c = *code;
/* Reaching OP_END without a match gives the NULL return. */
1227 if (c == OP_END) return NULL;
1229 /* XCLASS is used for classes that cannot be represented just by a bit
1230 map. This includes negated single high-valued characters. The length in
1231 the table is zero; the actual length is stored in the compiled code. */
1233 if (c == OP_XCLASS) code += GET(code, 1);
1235 /* Handle capturing bracket */
1237 else if (c == OP_CBRA)
/* The number to compare is read at offset 1+LINK_SIZE from the opcode. On a
match the const qualifier is cast away for the return; otherwise the item is
stepped over by its table length. */
1239 int n = GET2(code, 1+LINK_SIZE);
1240 if (n == number) return (uschar *)code;
1241 code += _pcre_OP_lengths[c];
1244 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1245 a multi-byte character. The length in the table is a minimum, so we have to
1246 arrange to skip the extra bytes. */
1250 code += _pcre_OP_lengths[c];
/* When the byte just before the new position is 0xc0 or above, a further
amount is skipped; it is taken from _pcre_utf8_table4, indexed by the low six
bits of that byte. */
1268 if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1277 /*************************************************
1278 * Scan compiled regex for recursion reference *
1279 *************************************************/
1281 /* This little function scans through a compiled pattern until it finds an
1282 instance of OP_RECURSE.
1285 code points to start of expression
1286 utf8 TRUE in UTF-8 mode
1288 Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1291 static const uschar *
1292 find_recurse(const uschar *code, BOOL utf8)
1296 register int c = *code;
/* The scan ends either at OP_END, giving NULL, or at the first OP_RECURSE,
whose address is returned unchanged. */
1297 if (c == OP_END) return NULL;
1298 if (c == OP_RECURSE) return code;
1300 /* XCLASS is used for classes that cannot be represented just by a bit
1301 map. This includes negated single high-valued characters. The length in
1302 the table is zero; the actual length is stored in the compiled code. */
1304 if (c == OP_XCLASS) code += GET(code, 1);
1306 /* Otherwise, we get the item's length from the table. In UTF-8 mode, opcodes
1307 that are followed by a character may be followed by a multi-byte character.
1308 The length in the table is a minimum, so we have to arrange to skip the extra
1313 code += _pcre_OP_lengths[c];
1331 if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1340 /*************************************************
1341 * Scan compiled branch for non-emptiness *
1342 *************************************************/
1344 /* This function scans through a branch of a compiled pattern to see whether it
1345 can match the empty string or not. It is called from could_be_empty()
1346 below and from compile_branch() when checking for an unlimited repeat of a
1347 group that can match nothing. Note that first_significant_code() skips over
1348 assertions. If we hit an unclosed bracket, we return "empty" - this means we've
1349 struck an inner bracket whose current branch will already have been scanned.
1352 code points to start of search
1353 endcode points to where to stop
1354 utf8 TRUE if in UTF8 mode
1356 Returns: TRUE if what is matched could be empty
1360 could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
/* Both the starting position and every later position of the scan come from
first_significant_code(), so the loop body only sees the items which that
function returns. */
1363 for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1365 code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1367 const uschar *ccode;
1371 if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE)
1374 if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1376 /* Scan a closed bracket */
1378 empty_branch = FALSE;
/* One branch that could be empty is enough. Once the flag is set, the
recursive call is no longer made and the remaining alternatives are only
stepped over. */
1381 if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1382 empty_branch = TRUE;
1383 code += GET(code, 1);
1385 while (*code == OP_ALT);
1386 if (!empty_branch) return FALSE; /* All branches are non-empty */
1388 /* Move past the KET and fudge things so that the increment in the "for"
1389 above has no effect. */
1392 code += 1 + LINK_SIZE - _pcre_OP_lengths[c];
1396 /* Handle the other opcodes */
1400 /* Check for quantifiers after a class */
/* ccode is set to the position found by adding GET(code, 1) to code, and the
repeat check is reached by a jump to the shared label. */
1404 ccode = code + GET(code, 1);
1405 goto CHECK_CLASS_REPEAT;
1418 case OP_CRSTAR: /* These could be empty; continue */
1424 default: /* Non-repeat => class must match */
1425 case OP_CRPLUS: /* These repeats aren't empty */
1431 if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1436 /* Opcodes that must match a character */
1443 case OP_NOT_WHITESPACE:
1445 case OP_NOT_WORDCHAR:
1461 case OP_TYPEMINPLUS:
1462 case OP_TYPEPOSPLUS:
1474 /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1475 MINUPTO, and POSUPTO may be followed by a multibyte character */
/* Each byte of the form 10xxxxxx found at code[2] moves the position on by
one. */
1487 if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1498 /*************************************************
1499 * Scan compiled regex for non-emptiness *
1500 *************************************************/
1502 /* This function is called to check for left recursive calls. We want to check
1503 the current branch of the current pattern to see if it could match the empty
1504 string. If it could, we must look outwards for branches at other levels,
1505 stopping when we pass beyond the bracket which is the subject of the recursion.
1508 code points to start of the recursion
1509 endcode points to where to stop (current RECURSE item)
1510 bcptr points to the chain of current (unclosed) branch starts
1511 utf8 TRUE if in UTF-8 mode
1513 Returns: TRUE if what is matched could be empty
1517 could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
/* The chain is followed through its outer links for as long as there is an
entry whose recorded branch start is not before code. The first such branch
that cannot be empty gives FALSE at once. */
1520 while (bcptr != NULL && bcptr->current >= code)
1522 if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1523 bcptr = bcptr->outer;
1530 /*************************************************
1531 * Check for POSIX class syntax *
1532 *************************************************/
1534 /* This function is called when the sequence "[:" or "[." or "[=" is
1535 encountered in a character class. It checks whether this is followed by an
1536 optional ^ and then a sequence of letters, terminated by a matching ":]" or
1540 ptr pointer to the initial [
1541 endptr where to return the end pointer
1542 cd pointer to compile data
1544 Returns: TRUE or FALSE
1548 check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
1550 int terminator; /* Don't combine these lines; the Solaris cc */
1551 terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
/* terminator is the character that follows the initial '['. A '^' straight
after it is stepped over, followed by every character that cd->ctypes marks as
a letter. */
1552 if (*(++ptr) == '^') ptr++;
1553 while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
/* The letters have to be followed by the same terminator character and then
by ']'. */
1554 if (*ptr == terminator && ptr[1] == ']')
1565 /*************************************************
1566 * Check POSIX class name *
1567 *************************************************/
1569 /* This function is called to check the name given in a POSIX-style class entry
1573 ptr points to the first letter
1574 len the length of the name
1576 Returns: a value representing the name, or -1 if unknown
1580 check_posix_name(const uschar *ptr, int len)
1582 register int yield = 0;
/* posix_name_lengths is searched up to its zero entry. An entry matches when
its length equals len and the first len bytes at ptr equal the entry with the
same index in posix_names; that index is the value returned. */
1583 while (posix_name_lengths[yield] != 0)
1585 if (len == posix_name_lengths[yield] &&
1586 strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
1593 /*************************************************
1594 * Adjust OP_RECURSE items in repeated group *
1595 *************************************************/
1597 /* OP_RECURSE items contain an offset from the start of the regex to the group
1598 that is referenced. This means that groups can be replicated for fixed
1599 repetition simply by copying (because the recursion is allowed to refer to
1600 earlier groups that are outside the current group). However, when a group is
1601 optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
1602 it, after it has been compiled. This means that any OP_RECURSE items within it
1603 that refer to the group itself or any contained groups have to have their
1604 offsets adjusted. That is one of the jobs of this function. Before it is called,
1605 the partially compiled regex must be temporarily terminated with OP_END.
1607 This function has been extended with the possibility of forward references for
1608 recursions and subroutine calls. It must also check the list of such references
1609 for the group we are dealing with. If it finds that one of the recursions in
1610 the current group is on this list, it adjusts the offset in the list, not the
1611 value in the reference (which is a group number).
1614 group points to the start of the group
1615 adjust the amount by which the group is to be moved
1616 utf8 TRUE in UTF-8 mode
1617 cd contains pointers to tables etc.
1618 save_hwm the hwm forward reference pointer at the start of the group
1624 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
1627 uschar *ptr = group;
/* find_recurse() is called again and again, each time resuming from ptr,
until it returns NULL. */
1628 while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1633 /* See if this recursion is on the forward reference list. If so, adjust the
1636 for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
1638 offset = GET(hc, 0);
1639 if (cd->start_code + offset == ptr + 1)
1641 PUT(hc, 0, offset + adjust);
1646 /* Otherwise, adjust the recursion offset if it's after the start of this
1651 offset = GET(ptr, 1);
1652 if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1655 ptr += 1 + LINK_SIZE;
1661 /*************************************************
1662 * Insert an automatic callout point *
1663 *************************************************/
1665 /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
1666 callout points before each pattern item.
1669 code current code pointer
1670 ptr current pattern pointer
1671 cd pointers to tables etc
1673 Returns: new code pointer
1677 auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
1679 *code++ = OP_CALLOUT;
/* Two fields of LINK_SIZE bytes are written at code: first the distance of
ptr from cd->start_pattern, then a zero. The pointer returned lies just past
both fields. */
1681 PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
1682 PUT(code, LINK_SIZE, 0); /* Default length */
1683 return code + 2*LINK_SIZE;
1688 /*************************************************
1689 * Complete a callout item *
1690 *************************************************/
1692 /* A callout item contains the length of the next item in the pattern, which
1693 we can't fill in till after we have reached the relevant point. This is used
1694 for both automatic and manual callouts.
1697 previous_callout points to previous callout item
1698 ptr current pattern pointer
1699 cd pointers to tables etc
1705 complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
/* The length is the distance of ptr from cd->start_pattern, less the value
held at offset 2 of the callout item. It is stored at offset 2 + LINK_SIZE of
the same item. */
1707 int length = ptr - cd->start_pattern - GET(previous_callout, 2);
1708 PUT(previous_callout, 2 + LINK_SIZE, length);
1714 /*************************************************
1715 * Get othercase range *
1716 *************************************************/
1718 /* This function is passed the start and end of a class range, in UTF-8 mode
1719 with UCP support. It searches up the characters, looking for internal ranges of
1720 characters in the "other" case. Each call returns the next one, updating the
1724 cptr points to starting character value; updated
1726 ocptr where to put start of othercase range
1727 odptr where to put end of othercase range
1729 Yield: TRUE when range returned; FALSE when no more
1733 get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
1734 unsigned int *odptr)
1736 unsigned int c, othercase, next;
/* Look from *cptr up to d for the first character whose other case, as given
by _pcre_ucp_othercase(), is not NOTACHAR. If c runs past d there is none, and
FALSE is returned. */
1738 for (c = *cptr; c <= d; c++)
1739 { if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR) break; }
1741 if (c > d) return FALSE;
1744 next = othercase + 1;
/* The second scan stops at the first following character whose other case is
not equal to next. */
1746 for (++c; c <= d; c++)
1748 if (_pcre_ucp_othercase(c) != next) break;
1757 #endif /* SUPPORT_UCP */
1761 /*************************************************
1762 * Check if auto-possessifying is possible *
1763 *************************************************/
1765 /* This function is called for unlimited repeats of certain items, to see
1766 whether the next thing could possibly match the repeated item. If not, it makes
1767 sense to automatically possessify the repeated item.
1770 op_code the repeated op code
1771 this data for this item, depends on the opcode
1772 utf8 TRUE in UTF-8 mode
1773 utf8_char used for utf8 character bytes, NULL if not relevant
1774 ptr next character in pattern
1775 options options bits
1776 cd contains pointers to tables etc.
1778 Returns: TRUE if possessifying is wanted
1782 check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
1783 const uschar *ptr, int options, compile_data *cd)
1787 /* Skip whitespace and comments in extended mode */
1789 if ((options & PCRE_EXTENDED) != 0)
1793 while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1796 while (*(++ptr) != 0)
1797 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1803 /* If the next item is one that we can handle, get its value. A non-negative
1804 value is a character, a negative value is an escape value. */
/* The escape is decoded by check_escape() using a scratch error variable. If
an error is reported, the answer is simply FALSE (do not possessify). */
1808 int temperrorcode = 0;
1809 next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
1810 if (temperrorcode != 0) return FALSE;
1811 ptr++; /* Point after the escape sequence */
1814 else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
1817 if (utf8) { GETCHARINC(next, ptr); } else
1824 /* Skip whitespace and comments in extended mode */
1826 if ((options & PCRE_EXTENDED) != 0)
1830 while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1833 while (*(++ptr) != 0)
1834 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
1840 /* If the next thing is itself optional, we have to give up. */
1842 if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0)
1845 /* Now compare the next item with the previous opcode. If the previous is a
1846 positive single character match, "item" either contains the character or, if
1847 "item" is greater than 127 in utf8 mode, the character's bytes are in
1851 /* Handle cases when the next item is a character. */
1853 if (next >= 0) switch(op_code)
1857 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
/* The result is TRUE exactly when the repeated character and the following
character differ. */
1859 return item != next;
1861 /* For CHARNC (caseless character) we must check the other case. If we have
1862 Unicode property support, we can use it to test the other case of
1863 high-valued characters. */
1867 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
1869 if (item == next) return FALSE;
1873 unsigned int othercase;
1874 if (next < 128) othercase = cd->fcc[next]; else
1876 othercase = _pcre_ucp_othercase((unsigned int)next);
1878 othercase = NOTACHAR;
1880 return (unsigned int)item != othercase;
1883 #endif /* SUPPORT_UTF8 */
1884 return (item != cd->fcc[next]); /* Non-UTF-8 mode */
1886 /* For OP_NOT, "item" must be a single-byte character. */
/* NOTE(review): this test appears to sit under the "next >= 0" switch above,
in which case it can never be true; confirm before relying on it. */
1889 if (next < 0) return FALSE; /* Not a character */
1890 if (item == next) return TRUE;
1891 if ((options & PCRE_CASELESS) == 0) return FALSE;
1895 unsigned int othercase;
1896 if (next < 128) othercase = cd->fcc[next]; else
1898 othercase = _pcre_ucp_othercase(next);
1900 othercase = NOTACHAR;
1902 return (unsigned int)item == othercase;
1905 #endif /* SUPPORT_UTF8 */
1906 return (item == cd->fcc[next]); /* Non-UTF-8 mode */
/* In the tests that follow, a character above 127 is never looked up in
cd->ctypes: the comparison with 127 settles the result first. */
1909 return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
1912 return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
1915 return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
1917 case OP_NOT_WHITESPACE:
1918 return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
1921 return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
1923 case OP_NOT_WORDCHAR:
1924 return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
1931 /* Handle the case when the next item is \d, \s, etc. */
1938 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
/* The same tests as above, applied to the repeated character (item) instead
of the following one. */
1943 return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
1946 return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
1949 return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
1952 return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
1955 return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
1958 return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
1965 return next == -ESC_D || next == -ESC_s || next == -ESC_W;
1968 return next == -ESC_d;
1971 return next == -ESC_S || next == -ESC_d || next == -ESC_w;
1973 case OP_NOT_WHITESPACE:
1974 return next == -ESC_s;
1977 return next == -ESC_W || next == -ESC_s;
1979 case OP_NOT_WORDCHAR:
1980 return next == -ESC_w || next == -ESC_d;
1986 /* Control does not reach here */
1991 /*************************************************
1992 * Compile one branch *
1993 *************************************************/
1995 /* Scan the pattern, compiling it into a vector. If the options are
1996 changed during the branch, the pointer is used to change the external options
1997 bits. This function is used during the pre-compile phase when we are trying
1998 to find out the amount of memory needed, as well as during the real compile
1999 phase. The value of lengthptr distinguishes the two phases.
2002 optionsptr pointer to the option bits
2003 codeptr points to the pointer to the current code point
2004 ptrptr points to the current pattern pointer
2005 errorcodeptr points to error code variable
2006 firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2007 reqbyteptr set to the last literal character required, else < 0
2008 bcptr points to current branch chain
2009 cd contains pointers to tables etc.
2010 lengthptr NULL during the real compile phase
2011 points to length accumulator during pre-compile phase
2013 Returns: TRUE on success
2014 FALSE, with *errorcodeptr set non-zero on error
2018 compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2019 int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2020 compile_data *cd, int *lengthptr)
2022 int repeat_type, op_type;
2023 int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
2025 int greedy_default, greedy_non_default;
2026 int firstbyte, reqbyte;
2027 int zeroreqbyte, zerofirstbyte;
2028 int req_caseopt, reqvary, tempreqvary;
2029 int options = *optionsptr;
2030 int after_manual_callout = 0;
2031 int length_prevgroup = 0;
2033 register uschar *code = *codeptr;
2034 uschar *last_code = code;
2035 uschar *orig_code = code;
2037 BOOL inescq = FALSE;
2038 BOOL groupsetfirstbyte = FALSE;
2039 const uschar *ptr = *ptrptr;
2040 const uschar *tempptr;
2041 uschar *previous = NULL;
2042 uschar *previous_callout = NULL;
2043 uschar *save_hwm = NULL;
2044 uschar classbits[32];
2048 BOOL utf8 = (options & PCRE_UTF8) != 0;
2049 uschar *class_utf8data;
2050 uschar utf8_char[6];
2053 uschar *utf8_char = NULL;
2057 if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2060 /* Set up the default and non-default settings for greediness */
2062 greedy_default = ((options & PCRE_UNGREEDY) != 0);
2063 greedy_non_default = greedy_default ^ 1;
2065 /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2066 matching encountered yet". It gets changed to REQ_NONE if we hit something that
2067 matches a non-fixed char first char; reqbyte just remains unset if we never
2070 When we hit a repeat whose minimum is zero, we may have to adjust these values
2071 to take the zero repeat into account. This is implemented by setting them to
2072 zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2073 item types that can be repeated set these backoff variables appropriately. */
2075 firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2077 /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2078 according to the current setting of the caseless flag. REQ_CASELESS is a bit
2079 value > 255. It is added into the firstbyte or reqbyte variables to record the
2080 case status of the value. This is used only for ASCII characters. */
2082 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2084 /* Switch on next character until the end of the branch */
2089 BOOL possessive_quantifier;
2092 int class_charcount;
2103 /* Get next byte in the pattern */
2107 /* If we are in the pre-compile phase, accumulate the length used for the
2108 previous cycle of this loop. */
2110 if (lengthptr != NULL)
2113 if (code > cd->hwm) cd->hwm = code; /* High water info */
2115 if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
2117 *errorcodeptr = ERR52;
2121 /* There is at least one situation where code goes backwards: this is the
2122 case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2123 the class is simply eliminated. However, it is created first, so we have to
2124 allow memory for it. Therefore, don't ever reduce the length at this point.
2127 if (code < last_code) code = last_code;
2128 *lengthptr += code - last_code;
2129 DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2131 /* If "previous" is set and it is not at the start of the work space, move
2132 it back to there, in order to avoid filling up the work space. Otherwise,
2133 if "previous" is NULL, reset the current code pointer to the start. */
2135 if (previous != NULL)
2137 if (previous > orig_code)
2139 memmove(orig_code, previous, code - previous);
2140 code -= previous - orig_code;
2141 previous = orig_code;
2144 else code = orig_code;
2146 /* Remember where this code item starts so we can pick up the length
2152 /* In the real compile phase, just check the workspace used by the forward
2155 else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
2157 *errorcodeptr = ERR52;
2161 /* If in \Q...\E, check for the end; if not, we have a literal */
2163 if (inescq && c != 0)
2165 if (c == '\\' && ptr[1] == 'E')
2173 if (previous_callout != NULL)
2175 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2176 complete_callout(previous_callout, ptr, cd);
2177 previous_callout = NULL;
2179 if ((options & PCRE_AUTO_CALLOUT) != 0)
2181 previous_callout = code;
2182 code = auto_callout(code, ptr, cd);
2188 /* Fill in length of a previous callout, except when the next thing is
2191 is_quantifier = c == '*' || c == '+' || c == '?' ||
2192 (c == '{' && is_counted_repeat(ptr+1));
2194 if (!is_quantifier && previous_callout != NULL &&
2195 after_manual_callout-- <= 0)
2197 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2198 complete_callout(previous_callout, ptr, cd);
2199 previous_callout = NULL;
2202 /* In extended mode, skip white space and comments */
2204 if ((options & PCRE_EXTENDED) != 0)
2206 if ((cd->ctypes[c] & ctype_space) != 0) continue;
2209 while (*(++ptr) != 0)
2211 if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2213 if (*ptr != 0) continue;
2215 /* Else fall through to handle end of string */
2220 /* No auto callout for quantifiers. */
2222 if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
2224 previous_callout = code;
2225 code = auto_callout(code, ptr, cd);
2230 /* ===================================================================*/
2231 case 0: /* The branch terminates at string end */
2232 case '|': /* or | or ) */
2234 *firstbyteptr = firstbyte;
2235 *reqbyteptr = reqbyte;
2238 if (lengthptr != NULL)
2240 *lengthptr += code - last_code; /* To include callout length */
2241 DPRINTF((">> end branch\n"));
2246 /* ===================================================================*/
2247 /* Handle single-character metacharacters. In multiline mode, ^ disables
2248 the setting of any following char as a first character. */
2251 if ((options & PCRE_MULTILINE) != 0)
2253 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2264 /* There can never be a first char if '.' is first, whatever happens about
2265 repeats. The value of reqbyte doesn't change either. */
2268 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2269 zerofirstbyte = firstbyte;
2270 zeroreqbyte = reqbyte;
2276 /* ===================================================================*/
2277 /* Character classes. If the included characters are all < 256, we build a
2278 32-byte bitmap of the permitted characters, except in the special case
2279 where there is only one such character. For negated classes, we build the
2280 map as usual, then invert it at the end. However, we use a different opcode
2281 so that data characters > 255 can be handled correctly.
2283 If the class contains characters outside the 0-255 range, a different
2284 opcode is compiled. It may optionally have a bit map for characters < 256,
2285 but those above are explicitly listed afterwards. A flag byte tells
2286 whether the bitmap is present, and whether this is a negated class or not.
2292 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
2293 they are encountered at the top level, so we'll do that too. */
2295 if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2296 check_posix_syntax(ptr, &tempptr, cd))
2298 *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;
2302 /* If the first character is '^', set the negation flag and skip it. */
2304 if ((c = *(++ptr)) == '^')
2306 negate_class = TRUE;
2311 negate_class = FALSE;
2314 /* Keep a count of chars with values < 256 so that we can optimize the case
2315 of just a single character (as long as it's < 256). However, for higher
2316 valued UTF-8 characters, we don't yet do any optimization. */
2318 class_charcount = 0;
2319 class_lastchar = -1;
2321 /* Initialize the 32-char bit map to all zeros. We build the map in a
2322 temporary bit of memory, in case the class contains only 1 character (less
2323 than 256), because in that case the compiled code doesn't use the bit map.
2326 memset(classbits, 0, 32 * sizeof(uschar));
2329 class_utf8 = FALSE; /* No chars >= 256 */
2330 class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */
2333 /* Process characters until ] is reached. By writing this as a "do" it
2334 means that an initial ] is taken as a data character. At the start of the
2335 loop, c contains the first byte of the character. */
2339 const uschar *oldptr;
2342 if (utf8 && c > 127)
2343 { /* Braces are required because the */
2344 GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
2348 /* Inside \Q...\E everything is literal except \E */
2352 if (c == '\\' && ptr[1] == 'E') /* If we are at \E */
2354 inescq = FALSE; /* Reset literal state */
2355 ptr++; /* Skip the 'E' */
2356 continue; /* Carry on with next */
2358 goto CHECK_RANGE; /* Could be range if \E follows */
2361 /* Handle POSIX class names. Perl allows a negation extension of the
2362 form [:^name:]. A square bracket that doesn't match the syntax is
2363 treated as a literal. We also recognize the POSIX constructions
2364 [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
2368 (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2369 check_posix_syntax(ptr, &tempptr, cd))
2371 BOOL local_negate = FALSE;
2372 int posix_class, taboffset, tabopt;
2373 register const uschar *cbits = cd->cbits;
2378 *errorcodeptr = ERR31;
2385 local_negate = TRUE;
2389 posix_class = check_posix_name(ptr, tempptr - ptr);
2390 if (posix_class < 0)
2392 *errorcodeptr = ERR30;
2396 /* If matching is caseless, upper and lower are converted to
2397 alpha. This relies on the fact that the class table starts with
2398 alpha, lower, upper as the first 3 entries. */
2400 if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
2403 /* We build the bit map for the POSIX class in a chunk of local store
2404 because we may be adding and subtracting from it, and we don't want to
2405 subtract bits that may be in the main map already. At the end we or the
2406 result into the bit map that is being built. */
2410 /* Copy in the first table (always present) */
2412 memcpy(pbits, cbits + posix_class_maps[posix_class],
2413 32 * sizeof(uschar));
2415 /* If there is a second table, add or remove it as required. */
2417 taboffset = posix_class_maps[posix_class + 1];
2418 tabopt = posix_class_maps[posix_class + 2];
2423 for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
2425 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
2428 /* Now see if we need to remove any special characters. An option
2429 value of 1 removes vertical space and 2 removes underscore. */
2431 if (tabopt < 0) tabopt = -tabopt;
2432 if (tabopt == 1) pbits[1] &= ~0x3c;
2433 else if (tabopt == 2) pbits[11] &= 0x7f;
2435 /* Add the POSIX table or its complement into the main table that is
2436 being built and we are done. */
2439 for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
2441 for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
2444 class_charcount = 10; /* Set > 1; assumes more than 1 per class */
2445 continue; /* End of POSIX syntax handling */
2448 /* Backslash may introduce a single character, or it may introduce one
2449 of the specials, which just set a flag. The sequence \b is a special
2450 case. Inside a class (and only there) it is treated as backspace.
2451 Elsewhere it marks a word boundary. Other escapes have preset maps ready
2452 to or into the one we are building. We assume they have more than one
2453 character in them, so set class_charcount bigger than one. */
2457 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2458 if (*errorcodeptr != 0) goto FAILED;
2460 if (-c == ESC_b) c = '\b'; /* \b is backspace in a class */
2461 else if (-c == ESC_X) c = 'X'; /* \X is literal X in a class */
2462 else if (-c == ESC_R) c = 'R'; /* \R is literal R in a class */
2463 else if (-c == ESC_Q) /* Handle start of quoted string */
2465 if (ptr[1] == '\\' && ptr[2] == 'E')
2467 ptr += 2; /* avoid empty string */
2475 register const uschar *cbits = cd->cbits;
2476 class_charcount += 2; /* Greater than 1 is what matters */
2478 /* Save time by not doing this in the pre-compile phase. */
2480 if (lengthptr == NULL) switch (-c)
2483 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
2487 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
2491 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
2495 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
2499 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
2500 classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
2504 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
2505 classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
2508 case ESC_E: /* Perl ignores an orphan \E */
2511 default: /* Not recognized; fall through */
2512 break; /* Need "default" setting to stop compiler warning. */
2515 /* In the pre-compile phase, just do the recognition. */
2517 else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
2518 c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
2520 /* We need to deal with \P and \p in both phases. */
2523 if (-c == ESC_p || -c == ESC_P)
2527 int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
2528 if (ptype < 0) goto FAILED;
2530 *class_utf8data++ = ((-c == ESC_p) != negated)?
2531 XCL_PROP : XCL_NOTPROP;
2532 *class_utf8data++ = ptype;
2533 *class_utf8data++ = pdata;
2534 class_charcount -= 2; /* Not a < 256 character */
2538 /* Unrecognized escapes are faulted if PCRE is running in its
2539 strict mode. By default, for compatibility with Perl, they are
2540 treated as literals. */
2542 if ((options & PCRE_EXTRA) != 0)
2544 *errorcodeptr = ERR7;
2548 class_charcount -= 2; /* Undo the default count from above */
2549 c = *ptr; /* Get the final character and fall through */
2552 /* Fall through if we have a single character (c >= 0). This may be
2553 greater than 256 in UTF-8 mode. */
2555 } /* End of backslash handling */
2557 /* A single character may be followed by '-' to form a range. However,
2558 Perl does not permit ']' to be the end of the range. A '-' character
2559 at the end is treated as a literal. Perl ignores orphaned \E sequences
2560 entirely. The code for handling \Q and \E is messy. */
2563 while (ptr[1] == '\\' && ptr[2] == 'E')
2571 if (!inescq && ptr[1] == '-')
2575 while (*ptr == '\\' && ptr[1] == 'E') ptr += 2;
2577 /* If we hit \Q (not followed by \E) at this point, go into escaped
2580 while (*ptr == '\\' && ptr[1] == 'Q')
2583 if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; }
2588 if (*ptr == 0 || (!inescq && *ptr == ']'))
2591 goto LONE_SINGLE_CHARACTER;
2596 { /* Braces are required because the */
2597 GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
2601 d = *ptr; /* Not UTF-8 mode */
2603 /* The second part of a range can be a single-character escape, but
2604 not any of the other escapes. Perl 5.6 treats a hyphen as a literal
2605 in such circumstances. */
2607 if (!inescq && d == '\\')
2609 d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2610 if (*errorcodeptr != 0) goto FAILED;
2612 /* \b is backspace; \X is literal X; \R is literal R; any other
2613 special means the '-' was literal */
2617 if (d == -ESC_b) d = '\b';
2618 else if (d == -ESC_X) d = 'X';
2619 else if (d == -ESC_R) d = 'R'; else
2622 goto LONE_SINGLE_CHARACTER; /* A few lines below */
2627 /* Check that the two values are in the correct order. Optimize
2628 one-character ranges */
2632 *errorcodeptr = ERR8;
2636 if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
2638 /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
2639 matching, we have to use an XCLASS with extra data items. Caseless
2640 matching for characters > 127 is available only if UCP support is
2644 if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
2648 /* With UCP support, we can find the other case equivalents of
2649 the relevant characters. There may be several ranges. Optimize how
2650 they fit with the basic range. */
2653 if ((options & PCRE_CASELESS) != 0)
2655 unsigned int occ, ocd;
2656 unsigned int cc = c;
2657 unsigned int origd = d;
2658 while (get_othercase_range(&cc, origd, &occ, &ocd))
2660 if (occ >= c && ocd <= d) continue; /* Skip embedded ranges */
2662 if (occ < c && ocd >= c - 1) /* Extend the basic range */
2663 { /* if there is overlap, */
2664 c = occ; /* noting that if occ < c */
2665 continue; /* we can't have ocd > d */
2666 } /* because a subrange is */
2667 if (ocd > d && occ <= d + 1) /* always shorter than */
2668 { /* the basic range. */
2675 *class_utf8data++ = XCL_SINGLE;
2679 *class_utf8data++ = XCL_RANGE;
2680 class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
2682 class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
2685 #endif /* SUPPORT_UCP */
2687 /* Now record the original range, possibly modified for UCP caseless
2688 overlapping ranges. */
2690 *class_utf8data++ = XCL_RANGE;
2691 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
2692 class_utf8data += _pcre_ord2utf8(d, class_utf8data);
2694 /* With UCP support, we are done. Without UCP support, there is no
2695 caseless matching for UTF-8 characters > 127; we can use the bit map
2696 for the smaller ones. */
2699 continue; /* With next character in the class */
2701 if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
2703 /* Adjust upper limit and fall through to set up the map */
2707 #endif /* SUPPORT_UCP */
2709 #endif /* SUPPORT_UTF8 */
2711 /* We use the bit map for all cases when not in UTF-8 mode; else
2712 ranges that lie entirely within 0-127 when there is UCP support; else
2713 for partial ranges without UCP support. */
2715 class_charcount += d - c + 1;
2718 /* We can save a bit of time by skipping this in the pre-compile. */
2720 if (lengthptr == NULL) for (; c <= d; c++)
2722 classbits[c/8] |= (1 << (c&7));
2723 if ((options & PCRE_CASELESS) != 0)
2725 int uc = cd->fcc[c]; /* flip case */
2726 classbits[uc/8] |= (1 << (uc&7));
2730 continue; /* Go get the next char in the class */
2733 /* Handle a lone single character - we can get here for a normal
2734 non-escape char, or after \ that introduces a single character or for an
2735 apparent range that isn't. */
2737 LONE_SINGLE_CHARACTER:
2739 /* Handle a character that cannot go in the bit map */
2742 if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
2745 *class_utf8data++ = XCL_SINGLE;
2746 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
2749 if ((options & PCRE_CASELESS) != 0)
2751 unsigned int othercase;
2752 if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR)
2754 *class_utf8data++ = XCL_SINGLE;
2755 class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
2758 #endif /* SUPPORT_UCP */
2762 #endif /* SUPPORT_UTF8 */
2764 /* Handle a single-byte character */
2766 classbits[c/8] |= (1 << (c&7));
2767 if ((options & PCRE_CASELESS) != 0)
2769 c = cd->fcc[c]; /* flip case */
2770 classbits[c/8] |= (1 << (c&7));
2777 /* Loop until ']' reached. This "while" is the end of the "do" above. */
2779 while ((c = *(++ptr)) != 0 && (c != ']' || inescq));
2781 if (c == 0) /* Missing terminating ']' */
2783 *errorcodeptr = ERR6;
2787 /* If class_charcount is 1, we saw precisely one character whose value is
2788 less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we
2789 can optimize the negative case only if there were no characters >= 128
2790 because OP_NOT and the related opcodes like OP_NOTSTAR operate on
2791 single-bytes only. This is an historical hangover. Maybe one day we can
2792 tidy these opcodes to handle multi-byte characters.
2794 The optimization throws away the bit map. We turn the item into a
2795 1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
2796 that OP_NOT does not support multibyte characters. In the positive case, it
2797 can cause firstbyte to be set. Otherwise, there can be no first char if
2798 this item is first, whatever repeat count may follow. In the case of
2799 reqbyte, save the previous value for reinstating. */
2802 if (class_charcount == 1 &&
2804 (!class_utf8 && (!negate_class || class_lastchar < 128))))
2807 if (class_charcount == 1)
2810 zeroreqbyte = reqbyte;
2812 /* The OP_NOT opcode works on one-byte characters only. */
2816 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2817 zerofirstbyte = firstbyte;
2819 *code++ = class_lastchar;
2823 /* For a single, positive character, get the value into mcbuffer, and
2824 then we can handle this with the normal one-character code. */
2827 if (utf8 && class_lastchar > 127)
2828 mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
2832 mcbuffer[0] = class_lastchar;
2836 } /* End of 1-char optimization */
2838 /* The general case - not the one-char optimization. If this is the first
2839 thing in the branch, there can be no first char setting, whatever the
2840 repeat count. Any reqbyte setting must remain unchanged after any kind of
2843 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2844 zerofirstbyte = firstbyte;
2845 zeroreqbyte = reqbyte;
2847 /* If there are characters with values > 255, we have to compile an
2848 extended class, with its own opcode. If there are no characters < 256,
2849 we can omit the bitmap in the actual compiled code. */
2854 *class_utf8data++ = XCL_END; /* Marks the end of extra data */
2855 *code++ = OP_XCLASS;
2857 *code = negate_class? XCL_NOT : 0;
2859 /* If the map is required, move up the extra data to make room for it;
2860 otherwise just move the code pointer to the end of the extra data. */
2862 if (class_charcount > 0)
2865 memmove(code + 32, code, class_utf8data - code);
2866 memcpy(code, classbits, 32);
2867 code = class_utf8data + 32;
2869 else code = class_utf8data;
2871 /* Now fill in the complete length of the item */
2873 PUT(previous, 1, code - previous);
2874 break; /* End of class handling */
2878 /* If there are no characters > 255, negate the 32-byte map if necessary,
2879 and copy it into the code vector. If this is the first thing in the branch,
2880 there can be no first char setting, whatever the repeat count. Any reqbyte
2881 setting must remain unchanged after any kind of repeat. */
2885 *code++ = OP_NCLASS;
2886 if (lengthptr == NULL) /* Save time in the pre-compile phase */
2887 for (c = 0; c < 32; c++) code[c] = ~classbits[c];
2892 memcpy(code, classbits, 32);
2898 /* ===================================================================*/
2899 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
2900 has been tested above. */
2903 if (!is_quantifier) goto NORMAL_CHAR;
2904 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
2905 if (*errorcodeptr != 0) goto FAILED;
2923 if (previous == NULL)
2925 *errorcodeptr = ERR9;
2929 if (repeat_min == 0)
2931 firstbyte = zerofirstbyte; /* Adjust for zero repeat */
2932 reqbyte = zeroreqbyte; /* Ditto */
2935 /* Remember whether this is a variable length repeat */
2937 reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
2939 op_type = 0; /* Default single-char op codes */
2940 possessive_quantifier = FALSE; /* Default not possessive quantifier */
2942 /* Save start of previous item, in case we have to move it up to make space
2943 for an inserted OP_ONCE for the additional '+' extension. */
2945 tempcode = previous;
2947 /* If the next character is '+', we have a possessive quantifier. This
2948 implies greediness, whatever the setting of the PCRE_UNGREEDY option.
2949 If the next character is '?' this is a minimizing repeat, by default,
2950 but if PCRE_UNGREEDY is set, it works the other way round. We change the
2951 repeat type to the non-default. */
2955 repeat_type = 0; /* Force greedy */
2956 possessive_quantifier = TRUE;
2959 else if (ptr[1] == '?')
2961 repeat_type = greedy_non_default;
2964 else repeat_type = greedy_default;
2966 /* If previous was a character match, abolish the item and generate a
2967 repeat item instead. If a char item has a minimum of more than one, ensure
2968 that it is set in reqbyte - it might not be if a sequence such as x{3} is
2969 the first thing in a branch because the x will have gone into firstbyte
2972 if (*previous == OP_CHAR || *previous == OP_CHARNC)
2974 /* Deal with UTF-8 characters that take up more than one byte. It's
2975 easier to write this out separately than try to macrify it. Use c to
2976 hold the length of the character in bytes, plus 0x80 to flag that it's a
2977 length rather than a small character. */
2980 if (utf8 && (code[-1] & 0x80) != 0)
2982 uschar *lastchar = code - 1;
2983 while((*lastchar & 0xc0) == 0x80) lastchar--;
2984 c = code - lastchar; /* Length of UTF-8 character */
2985 memcpy(utf8_char, lastchar, c); /* Save the char */
2986 c |= 0x80; /* Flag c as a length */
2991 /* Handle the case of a single byte - either with no UTF8 support, or
2992 with UTF-8 disabled, or for a UTF-8 character < 128. */
2996 if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
2999 /* If the repetition is unlimited, it pays to see if the next thing on
3000 the line is something that cannot possibly match this character. If so,
3001 automatically possessifying this item gains some performance in the case
3002 where the match fails. */
3004 if (!possessive_quantifier &&
3006 check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3009 repeat_type = 0; /* Force greedy */
3010 possessive_quantifier = TRUE;
3013 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
3016 /* If previous was a single negated character ([^a] or similar), we use
3017 one of the special opcodes, replacing it. The code is shared with single-
3018 character repeats by setting op_type to add a suitable offset into
3019 repeat_type. We can also test for auto-possessification. OP_NOT is
3020 currently used only for single-byte chars. */
3022 else if (*previous == OP_NOT)
3024 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
3026 if (!possessive_quantifier &&
3028 check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3030 repeat_type = 0; /* Force greedy */
3031 possessive_quantifier = TRUE;
3033 goto OUTPUT_SINGLE_REPEAT;
3036 /* If previous was a character type match (\d or similar), abolish it and
3037 create a suitable repeat item. The code is shared with single-character
3038 repeats by setting op_type to add a suitable offset into repeat_type. Note
3039 that the Unicode property types will be present only when SUPPORT_UCP is
3040 defined, but we don't wrap the little bits of code here because it just
3041 makes it horribly messy. */
3043 else if (*previous < OP_EODN)
3046 int prop_type, prop_value;
3047 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
3050 if (!possessive_quantifier &&
3052 check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3054 repeat_type = 0; /* Force greedy */
3055 possessive_quantifier = TRUE;
3058 OUTPUT_SINGLE_REPEAT:
3059 if (*previous == OP_PROP || *previous == OP_NOTPROP)
3061 prop_type = previous[1];
3062 prop_value = previous[2];
3064 else prop_type = prop_value = -1;
3067 code = previous; /* Usually overwrite previous item */
3069 /* If the maximum is zero then the minimum must also be zero; Perl allows
3070 this case, so we do too - by simply omitting the item altogether. */
3072 if (repeat_max == 0) goto END_REPEAT;
3074 /* All real repeats make it impossible to handle partial matching (maybe
3075 one day we will be able to remove this restriction). */
3077 if (repeat_max != 1) cd->nopartial = TRUE;
3079 /* Combine the op_type with the repeat_type */
3081 repeat_type += op_type;
3083 /* A minimum of zero is handled either as the special case * or ?, or as
3084 an UPTO, with the maximum given. */
3086 if (repeat_min == 0)
3088 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
3089 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
3092 *code++ = OP_UPTO + repeat_type;
3093 PUT2INC(code, 0, repeat_max);
3097 /* A repeat minimum of 1 is optimized into some special cases. If the
3098 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3099 left in place and, if the maximum is greater than 1, we use OP_UPTO with
3100 one less than the maximum. */
3102 else if (repeat_min == 1)
3104 if (repeat_max == -1)
3105 *code++ = OP_PLUS + repeat_type;
3108 code = oldcode; /* leave previous item in place */
3109 if (repeat_max == 1) goto END_REPEAT;
3110 *code++ = OP_UPTO + repeat_type;
3111 PUT2INC(code, 0, repeat_max - 1);
3115 /* The case {n,n} is just an EXACT, while the general case {n,m} is
3116 handled as an EXACT followed by an UPTO. */
3120 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
3121 PUT2INC(code, 0, repeat_min);
3123 /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
3124 we have to insert the character for the previous code. For a repeated
3125 Unicode property match, there are two extra bytes that define the
3126 required property. In UTF-8 mode, long characters have their length in
3127 c, with the 0x80 bit as a flag. */
3132 if (utf8 && c >= 128)
3134 memcpy(code, utf8_char, c & 7);
3143 *code++ = prop_type;
3144 *code++ = prop_value;
3147 *code++ = OP_STAR + repeat_type;
3150 /* Else insert an UPTO if the max is greater than the min, again
3151 preceded by the character, for the previously inserted code. If the
3152 UPTO is just for 1 instance, we can use QUERY instead. */
3154 else if (repeat_max != repeat_min)
3157 if (utf8 && c >= 128)
3159 memcpy(code, utf8_char, c & 7);
3167 *code++ = prop_type;
3168 *code++ = prop_value;
3170 repeat_max -= repeat_min;
3172 if (repeat_max == 1)
3174 *code++ = OP_QUERY + repeat_type;
3178 *code++ = OP_UPTO + repeat_type;
3179 PUT2INC(code, 0, repeat_max);
3184 /* The character or character type itself comes last in all cases. */
3187 if (utf8 && c >= 128)
3189 memcpy(code, utf8_char, c & 7);
3196 /* For a repeated Unicode property match, there are two extra bytes that
3197 define the required property. */
3202 *code++ = prop_type;
3203 *code++ = prop_value;
3208 /* If previous was a character class or a back reference, we put the repeat
3209 stuff after it, but just skip the item if the repeat was {0,0}. */
3211 else if (*previous == OP_CLASS ||
3212 *previous == OP_NCLASS ||
3214 *previous == OP_XCLASS ||
3216 *previous == OP_REF)
3218 if (repeat_max == 0)
3224 /* All real repeats make it impossible to handle partial matching (maybe
3225 one day we will be able to remove this restriction). */
3227 if (repeat_max != 1) cd->nopartial = TRUE;
3229 if (repeat_min == 0 && repeat_max == -1)
3230 *code++ = OP_CRSTAR + repeat_type;
3231 else if (repeat_min == 1 && repeat_max == -1)
3232 *code++ = OP_CRPLUS + repeat_type;
3233 else if (repeat_min == 0 && repeat_max == 1)
3234 *code++ = OP_CRQUERY + repeat_type;
3237 *code++ = OP_CRRANGE + repeat_type;
3238 PUT2INC(code, 0, repeat_min);
3239 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
3240 PUT2INC(code, 0, repeat_max);
3244 /* If previous was a bracket group, we may have to replicate it in certain
3247 else if (*previous == OP_BRA || *previous == OP_CBRA ||
3248 *previous == OP_ONCE || *previous == OP_COND)
3252 int len = code - previous;
3253 uschar *bralink = NULL;
3255 /* Repeating a DEFINE group is pointless */
3257 if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
3259 *errorcodeptr = ERR55;
3263 /* This is a paranoid check to stop integer overflow later on */
3265 if (len > MAX_DUPLENGTH)
3267 *errorcodeptr = ERR50;
3271 /* If the maximum repeat count is unlimited, find the end of the bracket
3272 by scanning through from the start, and compute the offset back to it
3273 from the current code pointer. There may be an OP_OPT setting following
3274 the final KET, so we can't find the end just by going back from the code
3277 if (repeat_max == -1)
3279 register uschar *ket = previous;
3280 do ket += GET(ket, 1); while (*ket != OP_KET);
3281 ketoffset = code - ket;
3284 /* The case of a zero minimum is special because of the need to stick
3285 OP_BRAZERO in front of it, and because the group appears once in the
3286 data, whereas in other cases it appears the minimum number of times. For
3287 this reason, it is simplest to treat this case separately, as otherwise
3288 the code gets far too messy. There are several special subcases when the
3291 if (repeat_min == 0)
3293 /* If the maximum is also zero, we just omit the group from the output
3296 if (repeat_max == 0)
3302 /* If the maximum is 1 or unlimited, we just have to stick in the
3303 BRAZERO and do no more at this point. However, we do need to adjust
3304 any OP_RECURSE calls inside the group that refer to the group itself or
3305 any internal or forward referenced group, because the offset is from
3306 the start of the whole regex. Temporarily terminate the pattern while
3309 if (repeat_max <= 1)
3312 adjust_recurse(previous, 1, utf8, cd, save_hwm);
3313 memmove(previous+1, previous, len);
3315 *previous++ = OP_BRAZERO + repeat_type;
3318 /* If the maximum is greater than 1 and limited, we have to replicate
3319 in a nested fashion, sticking OP_BRAZERO before each set of brackets.
3320 The first one has to be handled carefully because it's the original
3321 copy, which has to be moved up. The remainder can be handled by code
3322 that is common with the non-zero minimum case below. We have to
3323 adjust the value of repeat_max, since one less copy is required. Once
3324 again, we may have to adjust any OP_RECURSE calls inside the group. */
3330 adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
3331 memmove(previous + 2 + LINK_SIZE, previous, len);
3332 code += 2 + LINK_SIZE;
3333 *previous++ = OP_BRAZERO + repeat_type;
3334 *previous++ = OP_BRA;
3336 /* We chain together the bracket offset fields that have to be
3337 filled in later when the ends of the brackets are reached. */
3339 offset = (bralink == NULL)? 0 : previous - bralink;
3341 PUTINC(previous, 0, offset);
3347 /* If the minimum is greater than zero, replicate the group as many
3348 times as necessary, and adjust the maximum to the number of subsequent
3349 copies that we need. If we set a first char from the group, and didn't
3350 set a required char, copy the latter from the former. If there are any
3351 forward reference subroutine calls in the group, there will be entries on
3352 the workspace list; replicate these with an appropriate increment. */
3358 /* In the pre-compile phase, we don't actually do the replication. We
3359 just adjust the length as if we had. */
3361 if (lengthptr != NULL)
3362 *lengthptr += (repeat_min - 1)*length_prevgroup;
3364 /* This is compiling for real */
3368 if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
3369 for (i = 1; i < repeat_min; i++)
3372 uschar *this_hwm = cd->hwm;
3373 memcpy(code, previous, len);
3374 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3376 PUT(cd->hwm, 0, GET(hc, 0) + len);
3377 cd->hwm += LINK_SIZE;
3379 save_hwm = this_hwm;
3385 if (repeat_max > 0) repeat_max -= repeat_min;
3388 /* This code is common to both the zero and non-zero minimum cases. If
3389 the maximum is limited, it replicates the group in a nested fashion,
3390 remembering the bracket starts on a stack. In the case of a zero minimum,
3391 the first one was set up above. In all cases the repeat_max now specifies
3392 the number of additional copies needed. Again, we must remember to
3393 replicate entries on the forward reference list. */
3395 if (repeat_max >= 0)
3397 /* In the pre-compile phase, we don't actually do the replication. We
3398 just adjust the length as if we had. For each repetition we must add 1
3399 to the length for BRAZERO and for all but the last repetition we must
3400 add 2 + 2*LINKSIZE to allow for the nesting that occurs. */
3402 if (lengthptr != NULL && repeat_max > 0)
3403 *lengthptr += repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
3404 2 - 2*LINK_SIZE; /* Last one doesn't nest */
3406 /* This is compiling for real */
3408 else for (i = repeat_max - 1; i >= 0; i--)
3411 uschar *this_hwm = cd->hwm;
3413 *code++ = OP_BRAZERO + repeat_type;
3415 /* All but the final copy start a new nesting, maintaining the
3416 chain of brackets outstanding. */
3422 offset = (bralink == NULL)? 0 : code - bralink;
3424 PUTINC(code, 0, offset);
3427 memcpy(code, previous, len);
3428 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3430 PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
3431 cd->hwm += LINK_SIZE;
3433 save_hwm = this_hwm;
3437 /* Now chain through the pending brackets, and fill in their length
3438 fields (which are holding the chain links pro tem). */
3440 while (bralink != NULL)
3443 int offset = code - bralink + 1;
3444 uschar *bra = code - offset;
3445 oldlinkoffset = GET(bra, 1);
3446 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
3448 PUTINC(code, 0, offset);
3449 PUT(bra, 1, offset);
3453 /* If the maximum is unlimited, set a repeater in the final copy. We
3454 can't just offset backwards from the current code point, because we
3455 don't know if there's been an options resetting after the ket. The
3456 correct offset was computed above.
3458 Then, when we are doing the actual compile phase, check to see whether
3459 this group is a non-atomic one that could match an empty string. If so,
3460 convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
3461 that runtime checking can be done. [This check is also applied to
3462 atomic groups at runtime, but in a different way.] */
3466 uschar *ketcode = code - ketoffset;
3467 uschar *bracode = ketcode - GET(ketcode, 1);
3468 *ketcode = OP_KETRMAX + repeat_type;
3469 if (lengthptr == NULL && *bracode != OP_ONCE)
3471 uschar *scode = bracode;
3474 if (could_be_empty_branch(scode, ketcode, utf8))
3476 *bracode += OP_SBRA - OP_BRA;
3479 scode += GET(scode, 1);
3481 while (*scode == OP_ALT);
3486 /* Else there's some kind of shambles */
3490 *errorcodeptr = ERR11;
3494 /* If the character following a repeat is '+', or if certain optimization
3495 tests above succeeded, possessive_quantifier is TRUE. For some of the
3496 simpler opcodes, there is a special alternative opcode for this. For
3497 anything else, we wrap the entire repeated item inside OP_ONCE brackets.
3498 The '+' notation is just syntactic sugar, taken from Sun's Java package,
3499 but the special opcodes can optimize it a bit. The repeated item starts at
3500 tempcode, not at previous, which might be the first part of a string whose
3501 (former) last char we repeated.
3503 Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
3504 an 'upto' may follow. We skip over an 'exact' item, and then test the
3505 length of what remains before proceeding. */
3507 if (possessive_quantifier)
3510 if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||
3511 *tempcode == OP_NOTEXACT)
3512 tempcode += _pcre_OP_lengths[*tempcode];
3513 len = code - tempcode;
3514 if (len > 0) switch (*tempcode)
3516 case OP_STAR: *tempcode = OP_POSSTAR; break;
3517 case OP_PLUS: *tempcode = OP_POSPLUS; break;
3518 case OP_QUERY: *tempcode = OP_POSQUERY; break;
3519 case OP_UPTO: *tempcode = OP_POSUPTO; break;
3521 case OP_TYPESTAR: *tempcode = OP_TYPEPOSSTAR; break;
3522 case OP_TYPEPLUS: *tempcode = OP_TYPEPOSPLUS; break;
3523 case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
3524 case OP_TYPEUPTO: *tempcode = OP_TYPEPOSUPTO; break;
3526 case OP_NOTSTAR: *tempcode = OP_NOTPOSSTAR; break;
3527 case OP_NOTPLUS: *tempcode = OP_NOTPOSPLUS; break;
3528 case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
3529 case OP_NOTUPTO: *tempcode = OP_NOTPOSUPTO; break;
3532 memmove(tempcode + 1+LINK_SIZE, tempcode, len);
3533 code += 1 + LINK_SIZE;
3534 len += 1 + LINK_SIZE;
3535 tempcode[0] = OP_ONCE;
3537 PUTINC(code, 0, len);
3538 PUT(tempcode, 1, len);
3543 /* In all cases we no longer have a previous item. We also set the
3544 "follows varying string" flag for subsequently encountered reqbytes if
3545 it isn't already set and we have just passed a varying length item. */
3549 cd->req_varyopt |= reqvary;
3553 /* ===================================================================*/
3554 /* Start of nested parenthesized sub-expression, or comment or lookahead or
3555 lookbehind or option setting or condition or all the other extended
3556 parenthesis forms. First deal with the specials; all are introduced by ?,
3557 and the appearance of any of them means that this is not a capturing
3561 newoptions = options;
3566 if (*(++ptr) == '?')
3568 int i, set, unset, namelen;
3575 case '#': /* Comment; skip to ket */
3577 while (*ptr != 0 && *ptr != ')') ptr++;
3580 *errorcodeptr = ERR18;
3586 /* ------------------------------------------------------------ */
3587 case ':': /* Non-capturing bracket */
3593 /* ------------------------------------------------------------ */
3595 bravalue = OP_COND; /* Conditional group */
3597 /* A condition can be an assertion, a number (referring to a numbered
3598 group), a name (referring to a named group), or 'R', referring to
3599 recursion. R<digits> and R&name are also permitted for recursion tests.
3601 There are several syntaxes for testing a named group: (?(name)) is used
3602 by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
3604 There are two unfortunate ambiguities, caused by history. (a) 'R' can
3605 be the recursive thing or the name 'R' (and similarly for 'R' followed
3606 by digits), and (b) a number could be a name that consists of digits.
3607 In both cases, we look for a name first; if not found, we try the other
3610 /* For conditions that are assertions, check the syntax, and then exit
3611 the switch. This will take control down to where bracketed groups,
3612 including assertions, are processed. */
3614 if (ptr[1] == '?' && (ptr[2] == '=' || ptr[2] == '!' || ptr[2] == '<'))
3617 /* Most other conditions use OP_CREF (a couple change to OP_RREF
3618 below), and all need to skip 3 bytes at the start of the group. */
3620 code[1+LINK_SIZE] = OP_CREF;
3623 /* Check for a test for recursion in a named group. */
3625 if (ptr[1] == 'R' && ptr[2] == '&')
3629 code[1+LINK_SIZE] = OP_RREF; /* Change the type of test */
3632 /* Check for a test for a named group's having been set, using the Perl
3633 syntax (?(<name>) or (?('name') */
3635 else if (ptr[1] == '<')
3640 else if (ptr[1] == '\'')
3645 else terminator = 0;
3647 /* We now expect to read a name; anything else is an error */
3649 if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
3651 ptr += 1; /* To get the right offset */
3652 *errorcodeptr = ERR28;
3656 /* Read the name, but also get it as a number if it's all digits */
3660 while ((cd->ctypes[*ptr] & ctype_word) != 0)
3663 recno = ((digitab[*ptr] & ctype_digit) != 0)?
3664 recno * 10 + *ptr - '0' : -1;
3667 namelen = ptr - name;
3669 if ((terminator > 0 && *ptr++ != terminator) || *ptr++ != ')')
3671 ptr--; /* Error offset */
3672 *errorcodeptr = ERR26;
3676 /* Do no further checking in the pre-compile phase. */
3678 if (lengthptr != NULL) break;
3680 /* In the real compile we do the work of looking for the actual
3683 slot = cd->name_table;
3684 for (i = 0; i < cd->names_found; i++)
3686 if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
3687 slot += cd->name_entry_size;
3690 /* Found a previous named subpattern */
3692 if (i < cd->names_found)
3694 recno = GET2(slot, 0);
3695 PUT2(code, 2+LINK_SIZE, recno);
3698 /* Search the pattern for a forward reference */
3700 else if ((i = find_parens(ptr, cd->bracount, name, namelen,
3701 (options & PCRE_EXTENDED) != 0)) > 0)
3703 PUT2(code, 2+LINK_SIZE, i);
3706 /* If terminator == 0 it means that the name followed directly after
3707 the opening parenthesis [e.g. (?(abc)...] and in this case there are
3708 some further alternatives to try. For the cases where terminator != 0
3709 [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
3710 now checked all the possibilities, so give an error. */
3712 else if (terminator != 0)
3714 *errorcodeptr = ERR15;
3718 /* Check for (?(R) for recursion. Allow digits after R to specify a
3719 specific group number. */
3721 else if (*name == 'R')
3724 for (i = 1; i < namelen; i++)
3726 if ((digitab[name[i]] & ctype_digit) == 0)
3728 *errorcodeptr = ERR15;
3731 recno = recno * 10 + name[i] - '0';
3733 if (recno == 0) recno = RREF_ANY;
3734 code[1+LINK_SIZE] = OP_RREF; /* Change test type */
3735 PUT2(code, 2+LINK_SIZE, recno);
3738 /* Similarly, check for the (?(DEFINE) "condition", which is always
3741 else if (namelen == 6 && strncmp((char *)name, "DEFINE", 6) == 0)
3743 code[1+LINK_SIZE] = OP_DEF;
3747 /* Check for the "name" actually being a subpattern number. */
3751 PUT2(code, 2+LINK_SIZE, recno);
3754 /* Either an unidentified subpattern, or a reference to (?(0) */
3758 *errorcodeptr = (recno == 0)? ERR35: ERR15;
3764 /* ------------------------------------------------------------ */
3765 case '=': /* Positive lookahead */
3766 bravalue = OP_ASSERT;
3771 /* ------------------------------------------------------------ */
3772 case '!': /* Negative lookahead */
3773 bravalue = OP_ASSERT_NOT;
3778 /* ------------------------------------------------------------ */
3779 case '<': /* Lookbehind or named define */
3782 case '=': /* Positive lookbehind */
3783 bravalue = OP_ASSERTBACK;
3787 case '!': /* Negative lookbehind */
3788 bravalue = OP_ASSERTBACK_NOT;
3792 default: /* Could be name define, else bad */
3793 if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;
3794 ptr++; /* Correct offset for error */
3795 *errorcodeptr = ERR24;
3801 /* ------------------------------------------------------------ */
3802 case '>': /* One-time brackets */
3808 /* ------------------------------------------------------------ */
3809 case 'C': /* Callout - may be followed by digits; */
3810 previous_callout = code; /* Save for later completion */
3811 after_manual_callout = 1; /* Skip one item before completing */
3812 *code++ = OP_CALLOUT;
3815 while ((digitab[*(++ptr)] & ctype_digit) != 0)
3816 n = n * 10 + *ptr - '0';
3819 *errorcodeptr = ERR39;
3824 *errorcodeptr = ERR38;
3828 PUT(code, 0, ptr - cd->start_pattern + 1); /* Pattern offset */
3829 PUT(code, LINK_SIZE, 0); /* Default length */
3830 code += 2 * LINK_SIZE;
3836 /* ------------------------------------------------------------ */
3837 case 'P': /* Python-style named subpattern handling */
3838 if (*(++ptr) == '=' || *ptr == '>') /* Reference or recursion */
3840 is_recurse = *ptr == '>';
3842 goto NAMED_REF_OR_RECURSE;
3844 else if (*ptr != '<') /* Test for Python-style definition */
3846 *errorcodeptr = ERR41;
3849 /* Fall through to handle (?P< as (?< is handled */
3852 /* ------------------------------------------------------------ */
3853 DEFINE_NAME: /* Come here from (?< handling */
3856 terminator = (*ptr == '<')? '>' : '\'';
3859 while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
3860 namelen = ptr - name;
3862 /* In the pre-compile phase, just do a syntax check. */
3864 if (lengthptr != NULL)
3866 if (*ptr != terminator)
3868 *errorcodeptr = ERR42;
3871 if (cd->names_found >= MAX_NAME_COUNT)
3873 *errorcodeptr = ERR49;
3876 if (namelen + 3 > cd->name_entry_size)
3878 cd->name_entry_size = namelen + 3;
3879 if (namelen > MAX_NAME_SIZE)
3881 *errorcodeptr = ERR48;
3887 /* In the real compile, create the entry in the table */
3891 slot = cd->name_table;
3892 for (i = 0; i < cd->names_found; i++)
3894 int crc = memcmp(name, slot+2, namelen);
3897 if (slot[2+namelen] == 0)
3899 if ((options & PCRE_DUPNAMES) == 0)
3901 *errorcodeptr = ERR43;
3905 else crc = -1; /* Current name is substring */
3909 memmove(slot + cd->name_entry_size, slot,
3910 (cd->names_found - i) * cd->name_entry_size);
3913 slot += cd->name_entry_size;
3916 PUT2(slot, 0, cd->bracount + 1);
3917 memcpy(slot + 2, name, namelen);
3918 slot[2+namelen] = 0;
3922 /* In both cases, count the number of names we've encountered. */
3924 ptr++; /* Move past > or ' */
3926 goto NUMBERED_GROUP;
3929 /* ------------------------------------------------------------ */
3930 case '&': /* Perl recursion/subroutine syntax */
3935 /* We come here from the Python syntax above that handles both
3936 references (?P=name) and recursion (?P>name), as well as falling
3937 through from the Perl recursion syntax (?&name). */
3939 NAMED_REF_OR_RECURSE:
3941 while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
3942 namelen = ptr - name;
3944 /* In the pre-compile phase, do a syntax check and set a dummy
3945 reference number. */
3947 if (lengthptr != NULL)
3949 if (*ptr != terminator)
3951 *errorcodeptr = ERR42;
3954 if (namelen > MAX_NAME_SIZE)
3956 *errorcodeptr = ERR48;
3962 /* In the real compile, seek the name in the table */
3966 slot = cd->name_table;
3967 for (i = 0; i < cd->names_found; i++)
3969 if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
3970 slot += cd->name_entry_size;
3973 if (i < cd->names_found) /* Back reference */
3975 recno = GET2(slot, 0);
3977 else if ((recno = /* Forward back reference */
3978 find_parens(ptr, cd->bracount, name, namelen,
3979 (options & PCRE_EXTENDED) != 0)) <= 0)
3981 *errorcodeptr = ERR15;
3986 /* In both phases, we can now go to the code that handles numerical
3987 recursion or backreferences. */
3989 if (is_recurse) goto HANDLE_RECURSION;
3990 else goto HANDLE_REFERENCE;
3993 /* ------------------------------------------------------------ */
3994 case 'R': /* Recursion */
3995 ptr++; /* Same as (?0) */
3999 /* ------------------------------------------------------------ */
4000 case '0': case '1': case '2': case '3': case '4': /* Recursion or */
4001 case '5': case '6': case '7': case '8': case '9': /* subroutine */
4003 const uschar *called;
4005 while((digitab[*ptr] & ctype_digit) != 0)
4006 recno = recno * 10 + *ptr++ - '0';
4009 *errorcodeptr = ERR29;
4013 /* Come here from code above that handles a named recursion */
4018 called = cd->start_code;
4020 /* When we are actually compiling, find the bracket that is being
4021 referenced. Temporarily end the regex in case it doesn't exist before
4022 this point. If we end up with a forward reference, first check that
4023 the bracket does occur later so we can give the error (and position)
4024 now. Then remember this forward reference in the workspace so it can
4025 be filled in at the end. */
4027 if (lengthptr == NULL)
4030 if (recno != 0) called = find_bracket(cd->start_code, utf8, recno);
4032 /* Forward reference */
4036 if (find_parens(ptr, cd->bracount, NULL, recno,
4037 (options & PCRE_EXTENDED) != 0) < 0)
4039 *errorcodeptr = ERR15;
4042 called = cd->start_code + recno;
4043 PUTINC(cd->hwm, 0, code + 2 + LINK_SIZE - cd->start_code);
4046 /* If not a forward reference, and the subpattern is still open,
4047 this is a recursive call. We check to see if this is a left
4048 recursion that could loop for ever, and diagnose that case. */
4050 else if (GET(called, 1) == 0 &&
4051 could_be_empty(called, code, bcptr, utf8))
4053 *errorcodeptr = ERR40;
4058 /* Insert the recursion/subroutine item, automatically wrapped inside
4059 "once" brackets. Set up a "previous group" length so that a
4060 subsequent quantifier will work. */
4063 PUT(code, 1, 2 + 2*LINK_SIZE);
4064 code += 1 + LINK_SIZE;
4067 PUT(code, 1, called - cd->start_code);
4068 code += 1 + LINK_SIZE;
4071 PUT(code, 1, 2 + 2*LINK_SIZE);
4072 code += 1 + LINK_SIZE;
4074 length_prevgroup = 3 + 3*LINK_SIZE;
4077 /* Can't determine a first byte now */
4079 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
4083 /* ------------------------------------------------------------ */
4084 default: /* Other characters: check option setting */
4088 while (*ptr != ')' && *ptr != ':')
4092 case '-': optset = &unset; break;
4094 case 'J': /* Record that it changed in the external options */
4095 *optset |= PCRE_DUPNAMES;
4096 cd->external_options |= PCRE_JCHANGED;
4099 case 'i': *optset |= PCRE_CASELESS; break;
4100 case 'm': *optset |= PCRE_MULTILINE; break;
4101 case 's': *optset |= PCRE_DOTALL; break;
4102 case 'x': *optset |= PCRE_EXTENDED; break;
4103 case 'U': *optset |= PCRE_UNGREEDY; break;
4104 case 'X': *optset |= PCRE_EXTRA; break;
4106 default: *errorcodeptr = ERR12;
4107 ptr--; /* Correct the offset */
4112 /* Set up the changed option bits, but don't change anything yet. */
4114 newoptions = (options | set) & (~unset);
4116 /* If the options ended with ')' this is not the start of a nested
4117 group with option changes, so the options change at this level. If this
4118 item is right at the start of the pattern, the options can be
4119 abstracted and made external in the pre-compile phase, and ignored in
4120 the compile phase. This can be helpful when matching -- for instance in
4121 caseless checking of required bytes.
4123 If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are
4124 definitely *not* at the start of the pattern because something has been
4125 compiled. In the pre-compile phase, however, the code pointer can have
4126 that value after the start, because it gets reset as code is discarded
4127 during the pre-compile. However, this can happen only at top level - if
4128 we are within parentheses, the starting BRA will still be present. At
4129 any parenthesis level, the length value can be used to test if anything
4130 has been compiled at that level. Thus, a test for both these conditions
4131 is necessary to ensure we correctly detect the start of the pattern in
4134 If we are not at the pattern start, compile code to change the ims
4135 options if this setting actually changes any of them. We also pass the
4136 new setting back so that it can be put at the start of any following
4137 branches, and when this group ends (if we are in a group), a resetting
4138 item can be compiled. */
4142 if (code == cd->start_code + 1 + LINK_SIZE &&
4143 (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE))
4145 cd->external_options = newoptions;
4146 options = newoptions;
4150 if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
4153 *code++ = newoptions & PCRE_IMS;
4156 /* Change options at this level, and pass them back for use
4157 in subsequent branches. Reset the greedy defaults and the case
4158 value for firstbyte and reqbyte. */
4160 *optionsptr = options = newoptions;
4161 greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
4162 greedy_non_default = greedy_default ^ 1;
4163 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
4166 previous = NULL; /* This item can't be repeated */
4167 continue; /* It is complete */
4170 /* If the options ended with ':' we are heading into a nested group
4171 with possible change of options. Such groups are non-capturing and are
4172 not assertions of any kind. All we need to do is skip over the ':';
4173 the newoptions value is handled below. */
4177 } /* End of switch for character following (? */
4178 } /* End of (? handling */
4180 /* Opening parenthesis not followed by '?'. If PCRE_NO_AUTO_CAPTURE is set,
4181 all unadorned brackets become non-capturing and behave like (?:...)
4184 else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
4189 /* Else we have a capturing group. */
4195 PUT2(code, 1+LINK_SIZE, cd->bracount);
4199 /* Process nested bracketed regex. Assertions may not be repeated, but
4200 other kinds can be. All their opcodes are >= OP_ONCE. We copy code into a
4201 non-register variable in order to be able to pass its address because some
4202 compilers complain otherwise. Pass in a new setting for the ims options if
4203 they have changed. */
4205 previous = (bravalue >= OP_ONCE)? code : NULL;
4208 tempreqvary = cd->req_varyopt; /* Save value before bracket */
4209 length_prevgroup = 0; /* Initialize for pre-compile phase */
4212 newoptions, /* The complete new option state */
4213 options & PCRE_IMS, /* The previous ims option state */
4214 &tempcode, /* Where to put code (updated) */
4215 &ptr, /* Input pointer (updated) */
4216 errorcodeptr, /* Where to put an error message */
4217 (bravalue == OP_ASSERTBACK ||
4218 bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
4219 skipbytes, /* Skip over bracket number */
4220 &subfirstbyte, /* For possible first char */
4221 &subreqbyte, /* For possible last char */
4222 bcptr, /* Current branch chain */
4223 cd, /* Tables block */
4224 (lengthptr == NULL)? NULL : /* Actual compile phase */
4225 &length_prevgroup /* Pre-compile phase */
4229 /* At the end of compiling, code is still pointing to the start of the
4230 group, while tempcode has been updated to point past the end of the group
4231 and any option resetting that may follow it. The pattern pointer (ptr)
4232 is on the bracket. */
4234 /* If this is a conditional bracket, check that there are no more than
4235 two branches in the group, or just one if it's a DEFINE group. */
4237 if (bravalue == OP_COND)
4246 while (*tc != OP_KET);
4248 /* A DEFINE group is never obeyed inline (the "condition" is always
4249 false). It must have only one branch. */
4251 if (code[LINK_SIZE+1] == OP_DEF)
4255 *errorcodeptr = ERR54;
4258 bravalue = OP_DEF; /* Just a flag to suppress char handling below */
4261 /* A "normal" conditional group. If there is just one branch, we must not
4262 make use of its firstbyte or reqbyte, because this is equivalent to an
4263 empty second branch. */
4269 *errorcodeptr = ERR27;
4272 if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
4276 /* Error if hit end of pattern */
4280 *errorcodeptr = ERR14;
4284 /* In the pre-compile phase, update the length by the length of the nested
4285 group, less the brackets at either end. Then reduce the compiled code to
4286 just the brackets so that it doesn't use much memory if it is duplicated by
4289 if (lengthptr != NULL)
4291 *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
4293 PUTINC(code, 0, 1 + LINK_SIZE);
4295 PUTINC(code, 0, 1 + LINK_SIZE);
4298 /* Otherwise update the main code pointer to the end of the group. */
4300 else code = tempcode;
4302 /* For a DEFINE group, required and first character settings are not
4305 if (bravalue == OP_DEF) break;
4307 /* Handle updating of the required and first characters for other types of
4308 group. Update for normal brackets of all kinds, and conditions with two
4309 branches (see code above). If the bracket is followed by a quantifier with
4310 zero repeat, we have to back off. Hence the definition of zeroreqbyte and
4311 zerofirstbyte outside the main loop so that they can be accessed for the
4314 zeroreqbyte = reqbyte;
4315 zerofirstbyte = firstbyte;
4316 groupsetfirstbyte = FALSE;
4318 if (bravalue >= OP_ONCE)
4320 /* If we have not yet set a firstbyte in this branch, take it from the
4321 subpattern, remembering that it was set here so that a repeat of more
4322 than one can replicate it as reqbyte if necessary. If the subpattern has
4323 no firstbyte, set "none" for the whole branch. In both cases, a zero
4324 repeat forces firstbyte to "none". */
4326 if (firstbyte == REQ_UNSET)
4328 if (subfirstbyte >= 0)
4330 firstbyte = subfirstbyte;
4331 groupsetfirstbyte = TRUE;
4333 else firstbyte = REQ_NONE;
4334 zerofirstbyte = REQ_NONE;
4337 /* If firstbyte was previously set, convert the subpattern's firstbyte
4338 into reqbyte if there wasn't one, using the vary flag that was in
4339 existence beforehand. */
4341 else if (subfirstbyte >= 0 && subreqbyte < 0)
4342 subreqbyte = subfirstbyte | tempreqvary;
4344 /* If the subpattern set a required byte (or set a first byte that isn't
4345 really the first byte - see above), set it. */
4347 if (subreqbyte >= 0) reqbyte = subreqbyte;
4350 /* For a forward assertion, we take the reqbyte, if set. This can be
4351 helpful if the pattern that follows the assertion doesn't set a different
4352 char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
4353 for an assertion, however because it leads to incorrect effect for patterns
4354 such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
4355 of a firstbyte. This is overcome by a scan at the end if there's no
4356 firstbyte, looking for an asserted first char. */
4358 else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
4359 break; /* End of processing '(' */
4362 /* ===================================================================*/
4363 /* Handle metasequences introduced by \. For ones like \d, the ESC_ values
4364 are arranged to be the negation of the corresponding OP_values. For the
4365 back references, the values are ESC_REF plus the reference number. Only
4366 back references and those types that consume a character may be repeated.
4367 We can test for values between ESC_b and ESC_Z for the latter; this may
4368 have to change if any new ones are ever created. */
4372 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, FALSE);
4373 if (*errorcodeptr != 0) goto FAILED;
4377 if (-c == ESC_Q) /* Handle start of quoted string */
4379 if (ptr[1] == '\\' && ptr[2] == 'E') ptr += 2; /* avoid empty string */
4384 if (-c == ESC_E) continue; /* Perl ignores an orphan \E */
4386 /* For metasequences that actually match a character, we disable the
4387 setting of a first character if it hasn't already been set. */
4389 if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
4390 firstbyte = REQ_NONE;
4392 /* Set values to reset to if this is followed by a zero repeat. */
4394 zerofirstbyte = firstbyte;
4395 zeroreqbyte = reqbyte;
4397 /* \k<name> or \k'name' is a back reference by name (Perl syntax) */
4399 if (-c == ESC_k && (ptr[1] == '<' || ptr[1] == '\''))
4402 terminator = (*(++ptr) == '<')? '>' : '\'';
4403 goto NAMED_REF_OR_RECURSE;
4406 /* Back references are handled specially; must disable firstbyte if
4407 not set to cope with cases like (?=(\w+))\1: which would otherwise set
4412 recno = -c - ESC_REF;
4414 HANDLE_REFERENCE: /* Come here from named backref handling */
4415 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
4418 PUT2INC(code, 0, recno);
4419 cd->backref_map |= (recno < 32)? (1 << recno) : 1;
4420 if (recno > cd->top_backref) cd->top_backref = recno;
4423 /* So are Unicode property matches, if supported. */
4426 else if (-c == ESC_P || -c == ESC_p)
4430 int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
4431 if (ptype < 0) goto FAILED;
4433 *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
4439 /* If Unicode properties are not supported, \X, \P, and \p are not
4442 else if (-c == ESC_X || -c == ESC_P || -c == ESC_p)
4444 *errorcodeptr = ERR45;
4449 /* For the rest (including \X when Unicode properties are supported), we
4450 can obtain the OP value by negating the escape value. */
4454 previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
4460 /* We have a data character whose value is in c. In UTF-8 mode it may have
4461 a value > 127. We set its representation in the length/buffer, and then
4462 handle it as a data character. */
4465 if (utf8 && c > 127)
4466 mclength = _pcre_ord2utf8(c, mcbuffer);
4477 /* ===================================================================*/
4478 /* Handle a literal character. It is guaranteed not to be whitespace or #
4479 when the extended flag is set. If we are in UTF-8 mode, it may be a
4480 multi-byte literal character. */
4488 if (utf8 && c >= 0xc0)
4490 while ((ptr[1] & 0xc0) == 0x80)
4491 mcbuffer[mclength++] = *(++ptr);
4495 /* At this point we have the character's bytes in mcbuffer, and the length
4496 in mclength. When not in UTF-8 mode, the length is always 1. */
4500 *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARNC : OP_CHAR;
4501 for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
4503 /* Set the first and required bytes appropriately. If no previous first
4504 byte, set it from this character, but revert to none on a zero repeat.
4505 Otherwise, leave the firstbyte value alone, and don't change it on a zero
4508 if (firstbyte == REQ_UNSET)
4510 zerofirstbyte = REQ_NONE;
4511 zeroreqbyte = reqbyte;
4513 /* If the character is more than one byte long, we can set firstbyte
4514 only if it is not to be matched caselessly. */
4516 if (mclength == 1 || req_caseopt == 0)
4518 firstbyte = mcbuffer[0] | req_caseopt;
4519 if (mclength != 1) reqbyte = code[-1] | cd->req_varyopt;
4521 else firstbyte = reqbyte = REQ_NONE;
4524 /* firstbyte was previously set; we can set reqbyte only if the length is
4525 1 or the matching is caseful. */
4529 zerofirstbyte = firstbyte;
4530 zeroreqbyte = reqbyte;
4531 if (mclength == 1 || req_caseopt == 0)
4532 reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
4535 break; /* End of literal character handling */
4537 } /* end of big loop */
4540 /* Control never reaches here by falling through, only by a goto for all the
4541 error states. Pass back the position in the pattern so that it can be displayed
4542 to the user for diagnosing the error. */
4552 /*************************************************
4553 * Compile sequence of alternatives *
4554 *************************************************/
4556 /* Compile one bracketed group as a sequence of alternative branches, calling
compile_branch() for each branch and chaining the branches together.

On entry, ptr is pointing past the bracket character, but on return it
4557 points to the closing bracket, or vertical bar, or end of string. The code
4558 variable is pointing at the byte into which the BRA operator has been stored.
4559 If the ims options are changed at the start (for a (?ims: group) or during any
4560 branch, we need to insert an OP_OPT item at the start of every following branch
4561 to ensure they get set correctly at run time, and also pass the new options
4562 into every subsequent branch compile.
4564 This function is used during the pre-compile phase when we are trying to find
4565 out the amount of memory needed, as well as during the real compile phase. The
4566 value of lengthptr distinguishes the two phases.

NOTE(review): the line numbers embedded in this listing are not consecutive, so
some original lines are not shown here (no return type and no brace-only lines
appear, for example). Check the complete source before relying on the exact
control structure.

Arguments:
4569 options option bits, including any changes for this subpattern
4570 oldims previous settings of ims option bits
4571 codeptr -> the address of the current code pointer
4572 ptrptr -> the address of the current pattern pointer
4573 errorcodeptr -> pointer to error code variable
4574 lookbehind TRUE if this is a lookbehind assertion
4575 skipbytes skip this many bytes at start (for brackets and OP_COND)
4576 firstbyteptr place to put the first required character, or a negative number
4577 reqbyteptr place to put the last required character, or a negative number
4578 bcptr pointer to the chain of currently open branches
4579 cd points to the data block with tables pointers etc.
4580 lengthptr NULL during the real compile phase
4581 points to length accumulator during pre-compile phase
4583 Returns: TRUE on success
4587 compile_regex(int options, int oldims, uschar **codeptr, const uschar **ptrptr,
4588 int *errorcodeptr, BOOL lookbehind, int skipbytes, int *firstbyteptr,
4589 int *reqbyteptr, branch_chain *bcptr, compile_data *cd, int *lengthptr)
4591 const uschar *ptr = *ptrptr;
4592 uschar *code = *codeptr;
4593 uschar *last_branch = code;
4594 uschar *start_bracket = code;
4595 uschar *reverse_count = NULL;
4596 int firstbyte, reqbyte;
4597 int branchfirstbyte, branchreqbyte;
4604 firstbyte = reqbyte = REQ_UNSET;
4606 /* Accumulate the length for use in the pre-compile phase. Start with the
4607 length of the BRA and KET and any extra bytes that are required at the
4608 beginning. We accumulate in a local variable to save frequent testing of
4609 lengthptr for NULL. We cannot do this by looking at the value of code at the
4610 start and end of each alternative, because compiled items are discarded during
4611 the pre-compile phase so that the work space is not exceeded. */
4613 length = 2 + 2*LINK_SIZE + skipbytes;
4615 /* WARNING: If the above line is changed for any reason, you must also change
4616 the code that abstracts option settings at the start of the pattern and makes
4617 them global. It tests the value of length for (2 + 2*LINK_SIZE) in the
4618 pre-compile phase to find out whether anything has yet been compiled or not. */
4620 /* Offset is set zero to mark that this bracket is still open */
4623 code += 1 + LINK_SIZE + skipbytes; /* Past the opcode byte, its offset field, and skipbytes */
4625 /* Loop for each alternative branch */
4629 /* Handle a change of ims options at the start of the branch */
4631 if ((options & PCRE_IMS) != oldims)
4634 *code++ = options & PCRE_IMS;
4638 /* Set up dummy OP_REVERSE if lookbehind assertion */
4642 *code++ = OP_REVERSE;
4643 reverse_count = code;
4645 length += 1 + LINK_SIZE; /* Count the OP_REVERSE item in the accumulated length */
4648 /* Now compile the branch; in the pre-compile phase its length gets added
4651 if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstbyte,
4652 &branchreqbyte, &bc, cd, (lengthptr == NULL)? NULL : &length))
4658 /* In the real compile phase, there is some post-processing to be done. */
4660 if (lengthptr == NULL)
4662 /* If this is the first branch, the firstbyte and reqbyte values for the
4663 branch become the values for the regex. */
4665 if (*last_branch != OP_ALT)
4667 firstbyte = branchfirstbyte;
4668 reqbyte = branchreqbyte;
4671 /* If this is not the first branch, the first char and reqbyte have to
4672 match the values from all the previous branches, except that if the
4673 previous value for reqbyte didn't have REQ_VARY set, it can still match,
4674 and we set REQ_VARY for the regex. */
4678 /* If we previously had a firstbyte, but it doesn't match the new branch,
4679 we have to abandon the firstbyte for the regex, but if there was
4680 previously no reqbyte, it takes on the value of the old firstbyte. */
4682 if (firstbyte >= 0 && firstbyte != branchfirstbyte)
4684 if (reqbyte < 0) reqbyte = firstbyte;
4685 firstbyte = REQ_NONE;
4688 /* If we (now or from before) have no firstbyte, a firstbyte from the
4689 branch becomes a reqbyte if there isn't a branch reqbyte. */
4691 if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
4692 branchreqbyte = branchfirstbyte;
4694 /* Now ensure that the reqbytes match */
4696 if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
4698 else reqbyte |= branchreqbyte; /* To "or" REQ_VARY */
4701 /* If lookbehind, check that this branch matches a fixed-length string, and
4702 put the length into the OP_REVERSE item. Temporarily mark the end of the
4703 branch with OP_END. */
4709 fixed_length = find_fixedlength(last_branch, options);
4710 DPRINTF(("fixed length = %d\n", fixed_length));
4711 if (fixed_length < 0)
4713 *errorcodeptr = (fixed_length == -2)? ERR36 : ERR25;
4717 PUT(reverse_count, 0, fixed_length);
4721 /* Reached end of expression, either ')' or end of pattern. Go back through
4722 the alternative branches and reverse the chain of offsets, with the field in
4723 the BRA item now becoming an offset to the first alternative. If there are
4724 no alternatives, it points to the end of the group. The length in the
4725 terminating ket is always the length of the whole bracketed item. If any of
4726 the ims options were changed inside the group, compile a resetting op-code
4727 following, except at the very end of the pattern. Return leaving the pointer
4728 at the terminating char. */
4732 int branch_length = code - last_branch;
4735 int prev_length = GET(last_branch, 1); /* Backward link stored while the group was open */
4736 PUT(last_branch, 1, branch_length); /* Replace it with the forward offset */
4737 branch_length = prev_length;
4738 last_branch -= branch_length;
4740 while (branch_length > 0);
4742 /* Fill in the ket */
4745 PUT(code, 1, code - start_bracket);
4746 code += 1 + LINK_SIZE;
4748 /* Resetting option if needed */
4750 if ((options & PCRE_IMS) != oldims && *ptr == ')') /* A closing parenthesis means this is not the very end of the pattern */
4757 /* Set values to pass back */
4761 *firstbyteptr = firstbyte;
4762 *reqbyteptr = reqbyte;
4763 if (lengthptr != NULL) *lengthptr += length; /* Pre-compile phase only: pass the length back */
4767 /* Another branch follows; insert an "or" node. Its length field points back
4768 to the previous branch while the bracket remains open. At the end the chain
4769 is reversed. It's done like this so that the start of the bracket has a
4770 zero offset until it is closed, making it possible to detect recursion. */
4773 PUT(code, 1, code - last_branch);
4774 bc.current = last_branch = code;
4775 code += 1 + LINK_SIZE;
4777 length += 1 + LINK_SIZE; /* Count the alternation item in the accumulated length */
4779 /* Control never reaches here */
4785 /*************************************************
4786 * Check for anchored expression *
4787 *************************************************/
4789 /* Try to find out if this is an anchored regular expression. Consider each
4790 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
4791 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
4792 it's anchored. However, if this is a multiline pattern, then only OP_SOD
4793 counts, since OP_CIRC can match in the middle.
4795 We can also consider a regex to be anchored if OP_SOM starts all its branches.
4796 This is the code for \G, which means "match at start of match position, taking
4797 into account the match offset".
4799 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
4800 because that will try the rest of the pattern at all possible matching points,
4801 so there is no point trying again.... er ....
4803 .... except when the .* appears inside capturing parentheses, and there is a
4804 subsequent back reference to those parentheses. We haven't enough information
4805 to catch that case precisely.
4807 At first, the best we could do was to detect when .* was in capturing brackets
4808 and the highest back reference was greater than or equal to that level.
4809 However, by keeping a bitmap of the first 31 back references, we can catch some
4810 of the more common cases more precisely.

NOTE(review): the line numbers embedded in this listing are not consecutive, so
some original lines are not shown here (no return type and no brace-only lines
appear, for example). Check the complete source before relying on the exact
control structure.

Arguments:
4813 code points to start of expression (the bracket)
4814 options points to the options setting
4815 bracket_map a bitmap of which brackets we are inside while testing; this
4816 handles up to substring 31; after that we just have to take
4817 the less precise approach
4818 backref_map the back reference bitmap
4820 Returns: TRUE or FALSE
4824 is_anchored(register const uschar *code, int *options, unsigned int bracket_map,
4825 unsigned int backref_map)
4828 const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
4829 options, PCRE_MULTILINE, FALSE);
4830 register int op = *scode;
4832 /* Non-capturing brackets */
4836 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
4839 /* Capturing brackets. The bracket number selects one bit of the map; brackets
numbered 32 or more all share bit 0. NOTE(review): where int is 32 bits, n == 31
left-shifts a signed 1 into the sign bit; an unsigned constant would avoid
that - confirm. */
4841 else if (op == OP_CBRA)
4843 int n = GET2(scode, 1+LINK_SIZE); /* Number of this capturing bracket */
4844 int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
4845 if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
4848 /* Other brackets */
4850 else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
4852 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
4855 /* .* is not anchored unless DOTALL is set and it isn't in brackets that
4856 are or may be referenced. */
4858 else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
4859 op == OP_TYPEPOSSTAR) &&
4860 (*options & PCRE_DOTALL) != 0)
4862 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
4865 /* Check for explicit anchoring */
4867 else if (op != OP_SOD && op != OP_SOM &&
4868 ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
4870 code += GET(code, 1); /* Move on to the next alternative */
4872 while (*code == OP_ALT); /* Loop for each alternative */
4878 /*************************************************
4879 * Check for starting with ^ or .* *
4880 *************************************************/
4882 /* This is called to find out if every branch starts with ^ or .* so that
4883 "first char" processing can be done to speed things up in multiline
4884 matching and for non-DOTALL patterns that start with .* (which must start at
4885 the beginning or after \n). As in the case of is_anchored() (see above), we
4886 have to take account of back references to capturing brackets that contain .*
4887 because in that case we can't make the assumption.

NOTE(review): the line numbers embedded in this listing are not consecutive, so
some original lines are not shown here (no return type and few brace lines
appear, for example). Check the complete source before relying on the exact
control structure.

Arguments:
4890 code points to start of expression (the bracket)
4891 bracket_map a bitmap of which brackets we are inside while testing; this
4892 handles up to substring 31; after that we just have to take
4893 the less precise approach
4894 backref_map the back reference bitmap
4896 Returns: TRUE or FALSE
4900 is_startline(const uschar *code, unsigned int bracket_map,
4901 unsigned int backref_map)
4904 const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
4906 register int op = *scode;
4908 /* Non-capturing brackets */
4912 if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
4915 /* Capturing brackets. The bracket number selects one bit of the map; brackets
numbered 32 or more all share bit 0. NOTE(review): where int is 32 bits, n == 31
left-shifts a signed 1 into the sign bit; an unsigned constant would avoid
that - confirm. */
4917 else if (op == OP_CBRA)
4919 int n = GET2(scode, 1+LINK_SIZE); /* Number of this capturing bracket */
4920 int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
4921 if (!is_startline(scode, new_map, backref_map)) return FALSE;
4924 /* Other brackets */
4926 else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
4927 { if (!is_startline(scode, bracket_map, backref_map)) return FALSE; }
4929 /* .* means "start at start or after \n" if it isn't in brackets that
4930 may be referenced. */
4932 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
4934 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
4937 /* Check for explicit circumflex */
4939 else if (op != OP_CIRC) return FALSE;
4941 /* Move on to the next alternative */
4943 code += GET(code, 1);
4945 while (*code == OP_ALT); /* Loop for each alternative */
4951 /*************************************************
4952 * Check for asserted fixed first char *
4953 *************************************************/
4955 /* During compilation, the "first char" settings from forward assertions are
4956 discarded, because they can cause conflicts with actual literals that follow.
4957 However, if we end up without a first char setting for an unanchored pattern,
4958 it is worth scanning the regex to see if there is an initial asserted first
4959 char. If all branches start with the same asserted char, or with a bracket all
4960 of whose alternatives start with the same asserted char (recurse ad lib), then
4961 we return that char, otherwise -1.
4964 code points to start of expression (the bracket)
4965 options pointer to the options (used to check casing changes)
4966 inassert TRUE if in an assertion
4968 Returns: -1 or the fixed first char
/* find_firstassertedchar: scan each alternative of the bracket at "code" for a
fixed first character and return it only if the alternatives agree; -1 means
there is no such character. Alternatives are visited by adding GET(code, 1) to
code while the opcode found there is OP_ALT.
NOTE(review): the return type, the switch head, most case labels and the final
return of this function are not visible in this listing. */
4972 find_firstassertedchar(const uschar *code, int *options, BOOL inassert)
/* c is the character agreed on so far; it starts at -1. */
4974 register int c = -1;
/* Start looking 1+LINK_SIZE bytes into the alternative, at the first
significant opcode found from there. */
4977 const uschar *scode =
4978 first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);
4979 register int op = *scode;
/* Recurse into the nested group; inassert is TRUE for the recursion only when
the group opcode is OP_ASSERT. d receives the nested result. */
4991 if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)
/* Adopt the first result; a later result that differs means no common char. */
4993 if (c < 0) c = d; else if (c != d) return -1;
4996 case OP_EXACT: /* Fall through */
/* A character found outside an assertion is not accepted. */
5004 if (!inassert) return -1;
/* When PCRE_CASELESS is in force, flag the stored character as caseless. */
5008 if ((*options & PCRE_CASELESS) != 0) c |= REQ_CASELESS;
/* A mismatch with the value already held in c means no common first char. */
5010 else if (c != scode[1]) return -1;
5014 code += GET(code, 1);
5016 while (*code == OP_ALT);
5022 /*************************************************
5023 * Compile a Regular Expression *
5024 *************************************************/
5026 /* This function takes a string and returns a pointer to a block of store
5027 holding a compiled version of the expression. The original API for this
5028 function had no error code return variable; it is retained for backwards
5029 compatibility. The new function is given a new name.
5032 pattern the regular expression
5033 options various option bits
5034 errorcodeptr pointer to error code variable (pcre_compile2() only)
5035 can be NULL if you don't want a code value
5036 errorptr pointer to pointer to error text
5037 erroroffset ptr offset in pattern where error was detected
5038 tables pointer to character tables or NULL
5040 Returns: pointer to compiled data block, or NULL on error,
5041 with errorptr and erroroffset set
/* pcre_compile: the original entry point, which has no error code argument.
It forwards its arguments to pcre_compile2(), passing NULL for errorcodeptr,
and returns whatever pcre_compile2() returns. */
5044 PCRE_DATA_SCOPE pcre *
5045 pcre_compile(const char *pattern, int options, const char **errorptr,
5046 int *erroroffset, const unsigned char *tables)
5048 return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
/* pcre_compile2: compile "pattern" and return the compiled block. As far as
this listing shows, the work is done in two calls to compile_regex(): the first
compiles into the on-stack cworkspace and accumulates the required size in
"length"; the second compiles into the block obtained from pcre_malloc. After
that, forward references are resolved and the anchored, first-byte, start-line
and required-byte settings are derived. On failure the error text, offset and
(optionally) code are stored through errorptr, erroroffset and errorcodeptr.
NOTE(review): many short lines of this function (braces, preprocessor
directives, some declarations and returns) are not visible in this listing. */
5052 PCRE_DATA_SCOPE pcre *
5053 pcre_compile2(const char *pattern, int options, int *errorcodeptr,
5054 const char **errorptr, int *erroroffset, const unsigned char *tables)
5057 int length = 1; /* For final END opcode */
5058 int firstbyte, reqbyte, newline;
5065 const uschar *codestart;
5067 compile_data compile_block;
5068 compile_data *cd = &compile_block;
5070 /* This space is used for "compiling" into during the first phase, when we are
5071 computing the amount of memory that is needed. Compiled items are thrown away
5072 as soon as possible, so that a fairly large buffer should be sufficient for
5073 this purpose. The same space is used in the second phase for remembering where
5074 to fill in forward references to subpatterns. */
5076 uschar cworkspace[COMPILE_WORK_SIZE];
5079 /* Set this early so that early errors get offset 0. */
5081 ptr = (const uschar *)pattern;
5083 /* We can't pass back an error message if errorptr is NULL; I guess the best we
5084 can do is just return NULL, but we can set a code value if there is a code
5087 if (errorptr == NULL)
5089 if (errorcodeptr != NULL) *errorcodeptr = 99;
5094 if (errorcodeptr != NULL) *errorcodeptr = ERR0;
5096 /* However, we can give a message for this error */
/* NOTE(review): erroroffset is NULL on this path, yet the code at the
PCRE_EARLY_ERROR_RETURN label starts by storing through *erroroffset. Unless a
line not visible here guards that store, this jump ends in a NULL dereference;
confirm and, if so, jump past the store instead. */
5098 if (erroroffset == NULL)
5101 goto PCRE_EARLY_ERROR_RETURN;
5106 /* Can't support UTF8 unless PCRE has been compiled to include the code. */
/* With PCRE_UTF8 set and PCRE_NO_UTF8_CHECK clear, the pattern is validated;
the result of _pcre_valid_utf8() is stored in *erroroffset and a value >= 0 is
treated as the failure case. That path jumps to PCRE_UTF8_ERROR_RETURN. */
5109 utf8 = (options & PCRE_UTF8) != 0;
5110 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
5111 (*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0)
5114 goto PCRE_UTF8_ERROR_RETURN;
/* NOTE(review): this test rejects PCRE_UTF8 outright; presumably it belongs to
the build without UTF-8 support, selected by preprocessor lines that are not
visible in this listing. */
5117 if ((options & PCRE_UTF8) != 0)
5120 goto PCRE_EARLY_ERROR_RETURN;
/* Reject any option bit that is not in PUBLIC_OPTIONS. */
5124 if ((options & ~PUBLIC_OPTIONS) != 0)
5127 goto PCRE_EARLY_ERROR_RETURN;
5130 /* Set up pointers to the individual character tables */
5132 if (tables == NULL) tables = _pcre_default_tables;
5133 cd->lcc = tables + lcc_offset;
5134 cd->fcc = tables + fcc_offset;
5135 cd->cbits = tables + cbits_offset;
5136 cd->ctypes = tables + ctypes_offset;
5138 /* Handle different types of newline. The three bits give seven cases. The
5139 current code allows for fixed one- or two-byte sequences, plus "any". */
5141 switch (options & (PCRE_NEWLINE_CRLF | PCRE_NEWLINE_ANY))
5143 case 0: newline = NEWLINE; break; /* Compile-time default */
5144 case PCRE_NEWLINE_CR: newline = '\r'; break;
5145 case PCRE_NEWLINE_LF: newline = '\n'; break;
5146 case PCRE_NEWLINE_CR+
5147 PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
5148 case PCRE_NEWLINE_ANY: newline = -1; break;
5149 default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
/* At this point newline is -1 for "any", a single character, or two characters
packed as (first << 8) | second. The two-character form is unpacked high byte
first into nl[0] and nl[1]. NOTE(review): the conditions that choose between
the assignments below are not visible in this listing. */
5154 cd->nltype = NLTYPE_ANY;
5158 cd->nltype = NLTYPE_FIXED;
5162 cd->nl[0] = (newline >> 8) & 255;
5163 cd->nl[1] = newline & 255;
5168 cd->nl[0] = newline;
5172 /* Maximum back reference and backref bitmap. The bitmap records up to 31 back
5173 references to help in deciding whether (.*) can be treated as anchored or not.
5176 cd->top_backref = 0;
5177 cd->backref_map = 0;
5179 /* Reflect pattern for debugging output */
5181 DPRINTF(("------------------------------------------------------------------\n"));
5182 DPRINTF(("%s\n", pattern));
5184 /* Pretend to compile the pattern while actually just accumulating the length
5185 of memory required. This behaviour is triggered by passing a non-NULL final
5186 argument to compile_regex(). We pass a block of workspace (cworkspace) for it
5187 to compile parts of the pattern into; the compiled code is discarded when it is
5188 no longer needed, so hopefully this workspace will never overflow, though there
5189 is a test for its doing so. */
5192 cd->names_found = 0;
5193 cd->name_entry_size = 0;
5194 cd->name_table = NULL;
5195 cd->start_workspace = cworkspace;
5196 cd->start_code = cworkspace;
5197 cd->hwm = cworkspace;
5198 cd->start_pattern = (const uschar *)pattern;
/* The end of the pattern is located with strlen(), so the pattern has to be a
NUL-terminated string. */
5199 cd->end_pattern = (const uschar *)(pattern + strlen(pattern));
5200 cd->req_varyopt = 0;
5201 cd->nopartial = FALSE;
5202 cd->external_options = options;
5204 /* Now do the pre-compile. On error, errorcode will be set non-zero, so we
5205 don't need to look at the result of the function here. The initial options have
5206 been put into the cd block so that they can be changed if an option setting is
5207 found within the regex right at the beginning. Bringing initial option settings
5208 outside can help speed up starting point checks. */
5212 (void)compile_regex(cd->external_options, cd->external_options & PCRE_IMS,
5213 &code, &ptr, &errorcode, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, &length);
5214 if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN;
5216 DPRINTF(("end pre-compile: length=%d workspace=%d\n", length,
5217 cd->hwm - cworkspace));
5219 if (length > MAX_PATTERN_SIZE)
5222 goto PCRE_EARLY_ERROR_RETURN;
5225 /* Compute the size of data block needed and get it, either from malloc or
5226 externally provided function. Integer overflow should no longer be possible
5227 because nowadays we limit the maximum value of cd->names_found and
5228 cd->name_entry_size. */
/* NOTE(review): each name is budgeted name_entry_size + 3 bytes here; the
reason for the extra 3 bytes is not evident from this listing. */
5230 size = length + sizeof(real_pcre) + cd->names_found * (cd->name_entry_size + 3);
5231 re = (real_pcre *)(pcre_malloc)(size);
5236 goto PCRE_EARLY_ERROR_RETURN;
5239 /* Put in the magic number, and save the sizes, initial options, and character
5240 table pointer. NULL is used for the default character tables. The nullpad field
5241 is at the end; it's there to help in the case when a regex compiled on a system
5242 with 4-byte pointers is run on another with 8-byte pointers. */
5244 re->magic_number = MAGIC_NUMBER;
5246 re->options = cd->external_options;
5250 re->name_table_offset = sizeof(real_pcre);
5251 re->name_entry_size = cd->name_entry_size;
5252 re->name_count = cd->names_found;
5254 re->tables = (tables == _pcre_default_tables)? NULL : tables;
5257 /* The starting points of the name/number translation table and of the code are
5258 passed around in the compile data block. The start/end pattern and initial
5259 options are already set from the pre-compile phase, as is the name_entry_size
5260 field. Reset the bracket count and the names_found field. Also reset the hwm
5261 field; this time it's used for remembering forward references to subpatterns.
5265 cd->names_found = 0;
5266 cd->name_table = (uschar *)re + re->name_table_offset;
5267 codestart = cd->name_table + re->name_entry_size * re->name_count;
5268 cd->start_code = codestart;
5269 cd->hwm = cworkspace;
5270 cd->req_varyopt = 0;
5271 cd->nopartial = FALSE;
5273 /* Set up a starting, non-extracting bracket, then compile the expression. On
5274 error, errorcode will be set non-zero, so we don't need to look at the result
5275 of the function here. */
5277 ptr = (const uschar *)pattern;
5278 code = (uschar *)codestart;
5280 (void)compile_regex(re->options, re->options & PCRE_IMS, &code, &ptr,
5281 &errorcode, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, NULL);
5282 re->top_bracket = cd->bracount;
5283 re->top_backref = cd->top_backref;
5285 if (cd->nopartial) re->options |= PCRE_NOPARTIAL;
5287 /* If not reached end of pattern on success, there's an excess bracket. */
5289 if (errorcode == 0 && *ptr != 0) errorcode = ERR22;
5291 /* Fill in the terminating state and check for disastrous overflow, but
5292 if debugging, leave the test till after things are printed out. */
5297 if (code - codestart > length) errorcode = ERR23;
5300 /* Fill in any forward references that are required. */
/* cd->hwm was reset to cworkspace before the second compile; anything above
cworkspace is a stack of LINK_SIZE-byte offsets into the compiled code. Each
offset locates a group number (recno) that is looked up with find_bracket() and
then overwritten with the group's offset from codestart. An unknown group gives
ERR53. */
5302 while (errorcode == 0 && cd->hwm > cworkspace)
5305 const uschar *groupptr;
5306 cd->hwm -= LINK_SIZE;
5307 offset = GET(cd->hwm, 0);
5308 recno = GET(codestart, offset);
5309 groupptr = find_bracket(codestart, (re->options & PCRE_UTF8) != 0, recno);
5310 if (groupptr == NULL) errorcode = ERR53;
5311 else PUT(((uschar *)codestart), offset, groupptr - codestart);
5314 /* Give an error if there's a back reference to a non-existent capturing
5317 if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15;
5319 /* Failed to compile, or error while post-processing */
/* PCRE_EARLY_ERROR_RETURN derives the offset from how far ptr has advanced.
PCRE_UTF8_ERROR_RETURN is the entry used when *erroroffset has already been set
by the UTF-8 check. As listed, control runs from the first label into the
second, which stores the text for errorcode through errorptr and, if
errorcodeptr is not NULL, the code itself. */
5324 PCRE_EARLY_ERROR_RETURN:
5325 *erroroffset = ptr - (const uschar *)pattern;
5327 PCRE_UTF8_ERROR_RETURN:
5329 *errorptr = error_texts[errorcode];
5330 if (errorcodeptr != NULL) *errorcodeptr = errorcode;
5334 /* If the anchored option was not passed, set the flag if we can determine that
5335 the pattern is anchored by virtue of ^ characters or \A or anything else (such
5336 as starting with .* when DOTALL is set).
5338 Otherwise, if we know what the first byte has to be, save it, because that
5339 speeds up unanchored matches no end. If not, see if we can set the
5340 PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
5341 start with ^, and also when all branches start with .* for non-DOTALL matches.
5344 if ((re->options & PCRE_ANCHORED) == 0)
5346 int temp_options = re->options; /* May get changed during these scans */
5347 if (is_anchored(codestart, &temp_options, 0, cd->backref_map))
5348 re->options |= PCRE_ANCHORED;
5352 firstbyte = find_firstassertedchar(codestart, &temp_options, FALSE);
5353 if (firstbyte >= 0) /* Remove caseless flag for non-caseable chars */
/* If REQ_CASELESS is set but fcc maps ch to itself, only the bare character is
stored; otherwise firstbyte is stored unchanged. */
5355 int ch = firstbyte & 255;
5356 re->first_byte = ((firstbyte & REQ_CASELESS) != 0 &&
5357 cd->fcc[ch] == ch)? ch : firstbyte;
5358 re->options |= PCRE_FIRSTSET;
5360 else if (is_startline(codestart, 0, cd->backref_map))
5361 re->options |= PCRE_STARTLINE;
5365 /* For an anchored pattern, we use the "required byte" only if it follows a
5366 variable length item in the regex. Remove the caseless flag for non-caseable
5370 ((re->options & PCRE_ANCHORED) == 0 || (reqbyte & REQ_VARY) != 0))
5372 int ch = reqbyte & 255;
5373 re->req_byte = ((reqbyte & REQ_CASELESS) != 0 &&
5374 cd->fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte;
5375 re->options |= PCRE_REQCHSET;
5378 /* Print out the compiled data if debugging is enabled. This is never the
5379 case when building a production library. */
5383 printf("Length = %d top_bracket = %d top_backref = %d\n",
5384 length, re->top_bracket, re->top_backref);
5386 if (re->options != 0)
5388 printf("%s%s%s%s%s%s%s%s%s\n",
5389 ((re->options & PCRE_NOPARTIAL) != 0)? "nopartial " : "",
5390 ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
5391 ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
5392 ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
5393 ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
5394 ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
5395 ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
5396 ((re->options & PCRE_EXTRA) != 0)? "extra " : "",
5397 ((re->options & PCRE_UNGREEDY) != 0)? "ungreedy " : "");
5400 if ((re->options & PCRE_FIRSTSET) != 0)
5402 int ch = re->first_byte & 255;
5403 const char *caseless = ((re->first_byte & REQ_CASELESS) == 0)?
5405 if (isprint(ch)) printf("First char = %c%s\n", ch, caseless);
5406 else printf("First char = \\x%02x%s\n", ch, caseless);
5409 if ((re->options & PCRE_REQCHSET) != 0)
5411 int ch = re->req_byte & 255;
5412 const char *caseless = ((re->req_byte & REQ_CASELESS) == 0)?
5414 if (isprint(ch)) printf("Req char = %c%s\n", ch, caseless);
5415 else printf("Req char = \\x%02x%s\n", ch, caseless);
5418 pcre_printint(re, stdout);
5420 /* This check is done here in the debugging case so that the code that
5421 was compiled can be seen. */
5423 if (code - codestart > length)
5426 *errorptr = error_texts[ERR23];
/* NOTE(review): this cast omits the const qualifier used in the same
computation at PCRE_EARLY_ERROR_RETURN. */
5427 *erroroffset = ptr - (uschar *)pattern;
5428 if (errorcodeptr != NULL) *errorcodeptr = ERR23;
5436 /* End of pcre_compile.c */