1 /* $Cambridge: exim/src/src/pcre/pcre_compile.c,v 1.3 2006/11/07 16:50:36 ph10 Exp $ */
3 /*************************************************
4 * Perl-Compatible Regular Expressions *
5 *************************************************/
7 /* PCRE is a library of functions to support regular expressions whose syntax
8 and semantics are as close as possible to those of the Perl 5 language.
10 Written by Philip Hazel
11 Copyright (c) 1997-2006 University of Cambridge
13 -----------------------------------------------------------------------------
14 Redistribution and use in source and binary forms, with or without
15 modification, are permitted provided that the following conditions are met:
17 * Redistributions of source code must retain the above copyright notice,
18 this list of conditions and the following disclaimer.
20 * Redistributions in binary form must reproduce the above copyright
21 notice, this list of conditions and the following disclaimer in the
22 documentation and/or other materials provided with the distribution.
24 * Neither the name of the University of Cambridge nor the names of its
25 contributors may be used to endorse or promote products derived from
26 this software without specific prior written permission.
28 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
29 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
30 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
31 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
32 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
33 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
34 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
35 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
36 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
37 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38 POSSIBILITY OF SUCH DAMAGE.
39 -----------------------------------------------------------------------------
43 /* This module contains the external function pcre_compile(), along with
44 supporting internal functions that are not used by other modules. */
47 #define NLBLOCK cd /* The block containing newline information */
48 #include "pcre_internal.h"
51 /* When DEBUG is defined, we need the pcre_printint() function, which is also
52 used by pcretest. DEBUG is not defined when building a production library. */
55 #include "pcre_printint.src"
60 /*************************************************
61 * Code parameters and static tables *
62 *************************************************/
64 /* Maximum number of items on the nested bracket stacks at compile time. This
65 applies to the nesting of all kinds of parentheses. It does not limit
66 un-nested, non-capturing parentheses. This number can be made bigger if
67 necessary - it is used to dimension one int and one unsigned char vector at compile time. */
70 #define BRASTACK_SIZE 200
73 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
74 are simple data values; negative values are for special things like \d and so
75 on. Zero means further processing is needed (for things like \x), or the escape is invalid. */
78 #if !EBCDIC /* This is the "normal" table for ASCII systems */
79 static const short int escapes[] = {
80 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
81 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
82 '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */
83 0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
84 -ESC_P, -ESC_Q, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
85 -ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
86 '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */
87 0, 0, 0, 0, 0, 0, ESC_n, 0, /* h - o */
88 -ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, 0, -ESC_w, /* p - w */
89 0, 0, -ESC_z /* x - z */
92 #else /* This is the "abnormal" table for EBCDIC systems */
93 static const short int escapes[] = {
94 /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
95 /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
96 /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
97 /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
98 /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
99 /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
100 /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
101 /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
102 /* 88 */ 0, 0, 0, '{', 0, 0, 0, 0,
103 /* 90 */ 0, 0, 0, 'l', 0, ESC_n, 0, -ESC_p,
104 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
105 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0, 0, -ESC_w, 0,
106 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
107 /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
108 /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
109 /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
110 /* C8 */ 0, 0, 0, 0, 0, 0, 0, 0,
111 /* D0 */ '}', 0, 0, 0, 0, 0, 0, -ESC_P,
112 /* D8 */-ESC_Q, 0, 0, 0, 0, 0, 0, 0,
113 /* E0 */ '\\', 0, -ESC_S, 0, 0, 0, -ESC_W, -ESC_X,
114 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
115 /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
116 /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
121 /* Tables of names of POSIX character classes and their lengths. The list is
122 terminated by a zero length entry. The first three must be alpha, lower, upper,
123 as this is assumed for handling case independence. */
125 static const char *const posix_names[] = {
126 "alpha", "lower", "upper",
127 "alnum", "ascii", "blank", "cntrl", "digit", "graph",
128 "print", "punct", "space", "word", "xdigit" };
130 static const uschar posix_name_lengths[] = {
131 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
133 /* Table of class bit maps for each POSIX class. Each class is formed from a
134 base map, with an optional addition or removal of another map. Then, for some
135 classes, there is some additional tweaking: for [:blank:] the vertical space
136 characters are removed, and for [:alpha:] and [:alnum:] the underscore
137 character is removed. The triples in the table consist of the base map offset,
138 second map offset or -1 if no second map, and a non-negative value for map
139 addition or a negative value for map subtraction (if there are two maps). The
140 absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
141 remove vertical space characters, 2 => remove underscore. */
143 static const int posix_class_maps[] = {
144 cbit_word, cbit_digit, -2, /* alpha */
145 cbit_lower, -1, 0, /* lower */
146 cbit_upper, -1, 0, /* upper */
147 cbit_word, -1, 2, /* alnum - word without underscore */
148 cbit_print, cbit_cntrl, 0, /* ascii */
149 cbit_space, -1, 1, /* blank - a GNU extension */
150 cbit_cntrl, -1, 0, /* cntrl */
151 cbit_digit, -1, 0, /* digit */
152 cbit_graph, -1, 0, /* graph */
153 cbit_print, -1, 0, /* print */
154 cbit_punct, -1, 0, /* punct */
155 cbit_space, -1, 0, /* space */
156 cbit_word, -1, 0, /* word - a Perl extension */
157 cbit_xdigit,-1, 0 /* xdigit */
161 /* The texts of compile-time error messages. These are "char *" because they
162 are passed to the outside world. */
164 static const char *error_texts[] = {
166 "\\ at end of pattern",
167 "\\c at end of pattern",
168 "unrecognized character follows \\",
169 "numbers out of order in {} quantifier",
171 "number too big in {} quantifier",
172 "missing terminating ] for character class",
173 "invalid escape sequence in character class",
174 "range out of order in character class",
177 "operand of unlimited repeat could match the empty string",
178 "internal error: unexpected repeat",
179 "unrecognized character after (?",
180 "POSIX named classes are supported only within a class",
183 "reference to non-existent subpattern",
184 "erroffset passed as NULL",
185 "unknown option bit(s) set",
186 "missing ) after comment",
187 "parentheses nested too deeply",
189 "regular expression too large",
190 "failed to get memory",
191 "unmatched parentheses",
192 "internal error: code overflow",
193 "unrecognized character after (?<",
195 "lookbehind assertion is not fixed length",
196 "malformed number or name after (?(",
197 "conditional group contains more than two branches",
198 "assertion expected after (?(",
199 "(?R or (?digits must be followed by )",
201 "unknown POSIX class name",
202 "POSIX collating elements are not supported",
203 "this version of PCRE is not compiled with PCRE_UTF8 support",
205 "character value in \\x{...} sequence is too large",
207 "invalid condition (?(0)",
208 "\\C not allowed in lookbehind assertion",
209 "PCRE does not support \\L, \\l, \\N, \\U, or \\u",
210 "number after (?C is > 255",
211 "closing ) for (?C expected",
213 "recursive call could loop indefinitely",
214 "unrecognized character after (?P",
215 "syntax error after (?P",
216 "two named subpatterns have the same name",
217 "invalid UTF-8 string",
219 "support for \\P, \\p, and \\X has not been compiled",
220 "malformed \\P or \\p sequence",
221 "unknown property name after \\P or \\p",
222 "subpattern name is too long (maximum 32 characters)",
223 "too many named subpatterns (maximum 10,000)",
225 "repeated subpattern is too long",
226 "octal value is greater than \\377 (not in UTF-8 mode)"
230 /* Table to identify digits and hex digits. This is used when compiling
231 patterns. Note that the tables in chartables are dependent on the locale, and
232 may mark arbitrary characters as digits - but the PCRE compiling code expects
233 to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
234 a private table here. It costs 256 bytes, but it is a lot faster than doing
235 character value tests (at least in some simple cases I timed), and in some
236 applications one wants PCRE to compile efficiently as well as match efficiently.
239 For convenience, we use the same bit definitions as in chartables:
242 0x04 decimal digit, 0x08 hexadecimal digit
244 Then we can use ctype_digit and ctype_xdigit in the code. */
246 #if !EBCDIC /* This is the "normal" case, for ASCII systems */
247 static const unsigned char digitab[] =
249 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
250 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
251 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
252 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
253 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */
254 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */
255 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */
256 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */
257 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */
258 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */
259 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */
260 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */
261 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */
262 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */
263 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */
264 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */
265 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
266 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
267 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
268 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
269 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
270 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
271 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
272 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
273 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
274 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
275 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
276 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
277 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
278 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
279 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
280 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
282 #else /* This is the "abnormal" case, for EBCDIC systems */
283 static const unsigned char digitab[] =
285 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */
286 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
287 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */
288 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
289 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */
290 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
291 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */
292 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
293 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */
294 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */
295 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */
296 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- ¬ */
297 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */
298 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */
299 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
300 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
301 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 */
302 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
303 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */
304 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
305 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */
306 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
307 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */
308 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
309 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 */
310 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
311 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */
312 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
313 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */
314 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
315 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */
316 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
318 static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
319 0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */
320 0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */
321 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */
322 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
323 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 32- 39 */
324 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
325 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 */
326 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
327 0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 */
328 0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */
329 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */
330 0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- ¬ */
331 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */
332 0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */
333 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
334 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
335 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g */
336 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
337 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p */
338 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
339 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x */
340 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
341 0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 */
342 0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
343 0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* { - G */
344 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
345 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* } - P */
346 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
347 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* \ - X */
348 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
349 0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */
350 0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
354 /* Definition to allow mutual recursion */
357 compile_regex(int, int, int *, uschar **, const uschar **, int *, BOOL, int,
358 int *, int *, branch_chain *, compile_data *);
362 /*************************************************
364 *************************************************/
366 /* This function is called when a \ has been encountered. It either returns a
367 positive value for a simple escape such as \n, or a negative value which
368 encodes one of the more complicated things such as \d. When UTF-8 is enabled,
369 a positive value greater than 255 may be returned. On entry, ptr is pointing at
370 the \. On exit, it is on the final character of the escape sequence.
373 ptrptr points to the pattern position pointer
374 errorcodeptr points to the errorcode variable
375 bracount number of previous extracting brackets
376 options the options bits
377 isclass TRUE if inside a character class
379 Returns: zero or positive => a data character
380 negative => a special escape sequence
381 on error, errorcodeptr is set
385 check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
386 int options, BOOL isclass)
388 BOOL utf8 = (options & PCRE_UTF8) != 0;
389 const uschar *ptr = *ptrptr + 1;
392 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
393 ptr--; /* Set pointer back to the last byte */
395 /* If backslash is at the end of the pattern, it's an error. */
397 if (c == 0) *errorcodeptr = ERR1;
399 /* Non-alphamerics are literals. For digits or letters, do an initial lookup in
400 a table. A non-zero result is something that can be returned immediately.
401 Otherwise further processing may be required. */
403 #if !EBCDIC /* ASCII coding */
404 else if (c < '0' || c > 'z') {} /* Not alphameric */
405 else if ((i = escapes[c - '0']) != 0) c = i;
407 #else /* EBCDIC coding */
408 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphameric */
409 else if ((i = escapes[c - 0x48]) != 0) c = i;
412 /* Escapes that need further processing, or are illegal. */
416 const uschar *oldptr;
419 /* A number of Perl escapes are not handled by PCRE. We give an explicit error. */
427 *errorcodeptr = ERR37;
430 /* The handling of escape sequences consisting of a string of digits
431 starting with one that is not zero is not straightforward. By experiment,
432 the way Perl works seems to be as follows:
434 Outside a character class, the digits are read as a decimal number. If the
435 number is less than 10, or if there are that many previous extracting
436 left brackets, then it is a back reference. Otherwise, up to three octal
437 digits are read to form an escaped byte. Thus \123 is likely to be octal
438 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
439 value is greater than 377, the least significant 8 bits are taken. Inside a
440 character class, \ followed by a digit is always an octal number. */
442 case '1': case '2': case '3': case '4': case '5':
443 case '6': case '7': case '8': case '9':
449 while ((digitab[ptr[1]] & ctype_digit) != 0)
450 c = c * 10 + *(++ptr) - '0';
451 if (c < 10 || c <= bracount)
456 ptr = oldptr; /* Put the pointer back and fall through */
459 /* Handle an octal number following \. If the first digit is 8 or 9, Perl
460 generates a binary zero byte and treats the digit as a following literal.
461 Thus we have to pull back the pointer by one. */
463 if ((c = *ptr) >= '8')
470 /* \0 always starts an octal number, but we may drop through to here with a
471 larger first octal digit. The original code used just to take the least
472 significant 8 bits of octal numbers (I think this is what early Perls used
473 to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
474 than 3 octal digits. */
478 while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
479 c = c * 8 + *(++ptr) - '0';
480 if (!utf8 && c > 255) *errorcodeptr = ERR51;
483 /* \x is complicated. \x{ddd} is a character number which can be greater
484 than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
485 treated as a data character. */
490 const uschar *pt = ptr + 2;
494 while ((digitab[*pt] & ctype_xdigit) != 0)
496 register int cc = *pt++;
497 if (c == 0 && cc == '0') continue; /* Leading zeroes */
500 #if !EBCDIC /* ASCII coding */
501 if (cc >= 'a') cc -= 32; /* Convert to upper case */
502 c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));
503 #else /* EBCDIC coding */
504 if (cc >= 'a' && cc <= 'z') cc += 64; /* Convert to upper case */
505 c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));
511 if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
516 /* If the sequence of hex digits does not end with '}', then we don't
517 recognize this construct; fall through to the normal \x handling. */
520 /* Read just a single-byte hex-defined char */
523 while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
525 int cc; /* Some compilers don't like ++ */
526 cc = *(++ptr); /* in initializers */
527 #if !EBCDIC /* ASCII coding */
528 if (cc >= 'a') cc -= 32; /* Convert to upper case */
529 c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
530 #else /* EBCDIC coding */
531 if (cc <= 'z') cc += 64; /* Convert to upper case */
532 c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
537 /* Other special escapes not starting with a digit are straightforward */
543 *errorcodeptr = ERR2;
547 /* A letter is upper-cased; then the 0x40 bit is flipped. This coding
548 is ASCII-specific, but then the whole concept of \cx is ASCII-specific.
549 (However, an EBCDIC equivalent has now been added.) */
551 #if !EBCDIC /* ASCII coding */
552 if (c >= 'a' && c <= 'z') c -= 32;
554 #else /* EBCDIC coding */
555 if (c >= 'a' && c <= 'z') c += 64;
560 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
561 other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
562 for Perl compatibility, it is a literal. This code looks a bit odd, but
563 there used to be some cases other than the default, and there may be again
564 in future, so I haven't "optimized" it. */
567 if ((options & PCRE_EXTRA) != 0) switch(c)
570 *errorcodeptr = ERR3;
584 /*************************************************
586 *************************************************/
588 /* This function is called after \P or \p has been encountered, provided that
589 PCRE is compiled with support for Unicode properties. On entry, ptrptr is
590 pointing at the P or p. On exit, it is pointing at the final character of the escape sequence.
594 ptrptr points to the pattern position pointer
595 negptr points to a boolean that is set TRUE for negation else FALSE
596 dptr points to an int that is set to the detailed property value
597 errorcodeptr points to the error code variable
599 Returns: type value from ucp_type_table, or -1 for an invalid type
603 get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
606 const uschar *ptr = *ptrptr;
610 if (c == 0) goto ERROR_RETURN;
614 /* \P or \p can be followed by a name in {}, optionally preceded by ^ for negation. */
624 for (i = 0; i < sizeof(name) - 1; i++)
627 if (c == 0) goto ERROR_RETURN;
631 if (c !='}') goto ERROR_RETURN;
635 /* Otherwise there is just one following character */
645 /* Search for a recognized property name using binary chop */
648 top = _pcre_utt_size;
652 i = (bot + top) >> 1;
653 c = strcmp(name, _pcre_utt[i].name);
656 *dptr = _pcre_utt[i].value;
657 return _pcre_utt[i].type;
659 if (c > 0) bot = i + 1; else top = i;
662 *errorcodeptr = ERR47;
667 *errorcodeptr = ERR46;
676 /*************************************************
677 * Check for counted repeat *
678 *************************************************/
680 /* This function is called when a '{' is encountered in a place where it might
681 start a quantifier. It looks ahead to see if it really is a quantifier or not.
682 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
683 where the ddds are digits.
686 p pointer to the first char after '{'
688 Returns: TRUE or FALSE
692 is_counted_repeat(const uschar *p)
/* At least one digit is required straight after the '{'. */
694 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
/* Step over any further digits of the first number. */
695 while ((digitab[*p] & ctype_digit) != 0) p++;
/* {ddd} form. */
696 if (*p == '}') return TRUE;
/* Otherwise the only thing allowed after the first number is a comma. */
698 if (*p++ != ',') return FALSE;
/* {ddd,} form. */
699 if (*p == '}') return TRUE;
/* {ddd,ddd} form: the second number also needs at least one digit, after
which any further digits are stepped over. */
701 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
702 while ((digitab[*p] & ctype_digit) != 0) p++;
709 /*************************************************
710 * Read repeat counts *
711 *************************************************/
713 /* Read an item of the form {n,m} and return the values. This is called only
714 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
715 so the syntax is guaranteed to be correct, but we need to check the values.
718 p pointer to first char after '{'
719 minp pointer to int for min
720 maxp pointer to int for max
721 returned as -1 if no max
722 errorcodeptr points to error code variable
724 Returns: pointer to '}' on success;
725 current ptr on error, with errorcodeptr set non-zero
728 static const uschar *
729 read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
734 /* Read the minimum value and do a paranoid check: a negative value indicates
735 an integer overflow. */
/* NOTE(review): the digits are accumulated with no bound before the range
test runs, so the "< 0" test can only notice an overflow after it has already
happened. Signed overflow is undefined behaviour in C, and a long enough digit
string could also wrap round to a small non-negative value - confirm whether
the accumulation should stop as soon as the value exceeds 65535. */
737 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
/* A minimum that is negative or greater than 65535 is reported as ERR5. */
738 if (min < 0 || min > 65535)
740 *errorcodeptr = ERR5;
744 /* Read the maximum value if there is one, and again do a paranoid check on its size.
745 Also, max must not be less than min. */
/* A '}' directly after the first number means {n}: max is the same as min. */
747 if (*p == '}') max = min; else
/* The same accumulate-then-test pattern as for the minimum; the review note
about overflow above applies here as well. */
752 while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
753 if (max < 0 || max > 65535)
755 *errorcodeptr = ERR5;
/* Presumably ERR4 is raised when max is less than min, as the comment above
describes; the test that guards this assignment is not visible here. */
760 *errorcodeptr = ERR4;
766 /* Fill in the required variables, and pass back the pointer to the terminating '}'. */
776 /*************************************************
777 * Find forward referenced named subpattern *
778 *************************************************/
780 /* This function scans along a pattern looking for capturing subpatterns, and
781 counting them. If it finds a named pattern that matches the name it is given,
782 it returns its number. This is used for forward references to named
783 subpatterns. We know that if (?P< is encountered, the name will be terminated
784 by '>' because that is checked in the first pass.
787 pointer current position in the pattern
788 count current count of capturing parens
792 Returns: the number of the named subpattern, or -1 if not found
796 find_named_parens(const uschar *ptr, int count, const uschar *name, int namelen)
798 const uschar *thisname;
/* Walk the pattern one character at a time until the terminating zero.
NOTE(review): nothing visible in this loop skips character classes or quoted
text, so a '(' inside them would presumably be treated like any other -
confirm against the rest of the function. */
799 for (; *ptr != 0; ptr++)
/* A backslash escapes the following character, so step over it - unless the
backslash is the very last character of the pattern. */
801 if (*ptr == '\\' && ptr[1] != 0) { ptr++; continue; }
/* Only an opening parenthesis is of interest from here on. */
802 if (*ptr != '(') continue;
/* "(" that is not followed by "?" is counted as a capturing group. */
803 if (ptr[1] != '?') { count++; continue; }
/* "(?(" : move onto the inner '(' so that the loop increment steps past it
and it is not examined as the start of a group. */
804 if (ptr[2] == '(') { ptr += 2; continue; }
/* Any other "(?" item is ignored unless it is "(?P<". */
805 if (ptr[2] != 'P' || ptr[3] != '<') continue;
/* Advance to the '>' that ends the name. The loop has no end-of-pattern test
because, as the header comment says, the first pass has already checked that
the '>' is present. */
809 while (*ptr != '>') ptr++;
/* The name matches only if it has the requested length and the same bytes. */
810 if (namelen == ptr - thisname &&
811 strncmp((char *)name, (char*)thisname, namelen) == 0)
819 /*************************************************
820 * Find first significant op code *
821 *************************************************/
823 /* This is called by several functions that scan a compiled expression looking
824 for a fixed first character, or an anchoring op code etc. It skips over things
825 that do not influence this. For some calls, a change of option is important.
826 For some calls, it makes sense to skip negative forward and all backward
827 assertions, and also the \b assertion; for others it does not.
830 code pointer to the start of the group
831 options pointer to external options
832 optbit the option bit whose changing is significant, or
834 skipassert TRUE if certain assertions are to be skipped
836 Returns: pointer to the first significant opcode
840 first_significant_code(const uschar *code, int *options, int optbit,
848 if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
849 *options = (int)code[1];
855 case OP_ASSERTBACK_NOT:
856 if (!skipassert) return code;
857 do code += GET(code, 1); while (*code == OP_ALT);
858 code += _pcre_OP_lengths[*code];
861 case OP_WORD_BOUNDARY:
862 case OP_NOT_WORD_BOUNDARY:
863 if (!skipassert) return code;
869 code += _pcre_OP_lengths[*code];
876 /* Control never reaches here */
882 /*************************************************
883 * Find the fixed length of a pattern *
884 *************************************************/
886 /* Scan a pattern and compute the fixed length of subject that will match it,
887 if the length is fixed. This is needed for dealing with backward assertions.
888 In UTF8 mode, the result is in characters rather than bytes.
891 code points to the start of the pattern (the bracket)
892 options the compiling options
894 Returns: the fixed length, or -1 if there is no fixed length,
895 or -2 if \C was encountered
899 find_fixedlength(uschar *code, int options)
903 register int branchlength = 0;
904 register uschar *cc = code + 1 + LINK_SIZE;
906 /* Scan along the opcodes for this branch. If we get to the end of the
907 branch, check the length against that of the other branches. */
912 register int op = *cc;
913 if (op >= OP_BRA) op = OP_BRA;
920 d = find_fixedlength(cc, options);
923 do cc += GET(cc, 1); while (*cc == OP_ALT);
927 /* Reached end of a branch; if it's a ket it is the end of a nested
928 call. If it's ALT it is an alternation in a nested call. If it is
929 END it's the end of the outer call. All can be handled by the same code. */
936 if (length < 0) length = branchlength;
937 else if (length != branchlength) return -1;
938 if (*cc != OP_ALT) return length;
943 /* Skip over assertive subpatterns */
948 case OP_ASSERTBACK_NOT:
949 do cc += GET(cc, 1); while (*cc == OP_ALT);
952 /* Skip over things that don't match chars */
965 case OP_NOT_WORD_BOUNDARY:
966 case OP_WORD_BOUNDARY:
967 cc += _pcre_OP_lengths[*cc];
970 /* Handle literal characters */
978 if ((options & PCRE_UTF8) != 0)
980 while ((*cc & 0xc0) == 0x80) cc++;
985 /* Handle exact repetitions. The count is already in characters, but we
986 need to skip over a multibyte character in UTF8 mode. */
989 branchlength += GET2(cc,1);
992 if ((options & PCRE_UTF8) != 0)
994 while((*cc & 0x80) == 0x80) cc++;
1000 branchlength += GET2(cc,1);
1004 /* Handle single-char matchers */
1013 case OP_NOT_WHITESPACE:
1015 case OP_NOT_WORDCHAR:
1022 /* The single-byte matcher isn't allowed */
1027 /* Check a class for variable quantification */
1031 cc += GET(cc, 1) - 33;
1049 if (GET2(cc,1) != GET2(cc,3)) return -1;
1050 branchlength += GET2(cc,1);
1059 /* Anything else is variable length */
1065 /* Control never gets here */
1071 /*************************************************
1072 * Scan compiled regex for numbered bracket *
1073 *************************************************/
1075 /* This little function scans through a compiled pattern until it finds a
1076 capturing bracket with the given number.
1079 code points to start of expression
1080 utf8 TRUE in UTF-8 mode
1081 number the required bracket number
1083 Returns: pointer to the opcode for the bracket, or NULL if not found
1086 static const uschar *
1087 find_bracket(const uschar *code, BOOL utf8, int number)
1091 register int c = *code;
1092 if (c == OP_END) return NULL;
1094 /* XCLASS is used for classes that cannot be represented just by a bit
1095 map. This includes negated single high-valued characters. The length in
1096 the table is zero; the actual length is stored in the compiled code. */
1098 if (c == OP_XCLASS) code += GET(code, 1);
1100 /* Handle bracketed group */
1102 else if (c > OP_BRA)
1105 if (n > EXTRACT_BASIC_MAX) n = GET2(code, 2+LINK_SIZE);
1106 if (n == number) return (uschar *)code;
1107 code += _pcre_OP_lengths[OP_BRA];
1110 /* Otherwise, we get the item's length from the table. In UTF-8 mode, opcodes
1111 that are followed by a character may be followed by a multi-byte character.
1112 The length in the table is a minimum, so we have to scan along to skip the
1113 extra bytes. All opcodes are less than 128, so we can use relatively
1118 code += _pcre_OP_lengths[c];
1132 while ((*code & 0xc0) == 0x80) code++;
1141 /*************************************************
1142 * Scan compiled regex for recursion reference *
1143 *************************************************/
1145 /* This little function scans through a compiled pattern until it finds an
1146 instance of OP_RECURSE.
1149 code points to start of expression
1150 utf8 TRUE in UTF-8 mode
1152 Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
/* find_recurse: scan compiled code for the next OP_RECURSE item. The yield is
a pointer to that opcode, or NULL if OP_END is reached first. */
1155 static const uschar *
1156 find_recurse(const uschar *code, BOOL utf8)
/* c holds the opcode at the current scan position. */
1160 register int c = *code;
1161 if (c == OP_END) return NULL;
1162 if (c == OP_RECURSE) return code;
1164 /* XCLASS is used for classes that cannot be represented just by a bit
1165 map. This includes negated single high-valued characters. The length in
1166 the table is zero; the actual length is stored in the compiled code. */
1168 if (c == OP_XCLASS) code += GET(code, 1);
1170 /* All bracketed groups have the same length. */
1172 else if (c > OP_BRA)
/* Every opcode above OP_BRA is skipped using the table length for OP_BRA. */
1174 code += _pcre_OP_lengths[OP_BRA];
1177 /* Otherwise, we get the item's length from the table. In UTF-8 mode, opcodes
1178 that are followed by a character may be followed by a multi-byte character.
1179 The length in the table is a minimum, so we have to scan along to skip the
1180 extra bytes. All opcodes are less than 128, so we can use relatively
1185 code += _pcre_OP_lengths[c];
1199 while ((*code & 0xc0) == 0x80) code++;
1208 /*************************************************
1209 * Scan compiled branch for non-emptiness *
1210 *************************************************/
1212 /* This function scans through a branch of a compiled pattern to see whether it
1213 can match the empty string or not. It is called only from could_be_empty()
1214 below. Note that first_significant_code() skips over assertions. If we hit an
1215 unclosed bracket, we return "empty" - this means we've struck an inner bracket
1216 whose current branch will already have been scanned.
1219 code points to start of search
1220 endcode points to where to stop
1221 utf8 TRUE if in UTF8 mode
1223 Returns: TRUE if what is matched could be empty
/* could_be_empty_branch: walk the items of one compiled branch and report
whether the branch could match the empty string (TRUE) or not (FALSE). */
1227 could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
/* Scanning starts 1 + LINK_SIZE bytes after "code" and advances by each
opcode's table length; first_significant_code() is applied at every step. */
1230 for (code = first_significant_code(code + 1 + LINK_SIZE, NULL, 0, TRUE);
1232 code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1234 const uschar *ccode;
/* A zero link value at offset 1 is taken to mean the bracket is not closed. */
1241 if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1243 /* Scan a closed bracket */
/* empty_branch becomes TRUE as soon as one alternative of the group can be
empty; after that the recursive call is no longer made. */
1245 empty_branch = FALSE;
1248 if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1249 empty_branch = TRUE;
/* Advance by the link value; the loop repeats while that lands on OP_ALT. */
1250 code += GET(code, 1);
1252 while (*code == OP_ALT);
1253 if (!empty_branch) return FALSE; /* All branches are non-empty */
/* Advance by 1 + LINK_SIZE past the item on which the loop stopped. */
1254 code += 1 + LINK_SIZE;
1260 /* Check for quantifiers after a class */
/* ccode is set from the offset read with GET before jumping to the shared
class-repeat check. */
1264 ccode = code + GET(code, 1);
1265 goto CHECK_CLASS_REPEAT;
1278 case OP_CRSTAR: /* These could be empty; continue */
1284 default: /* Non-repeat => class must match */
1285 case OP_CRPLUS: /* These repeats aren't empty */
/* The value read with GET2 at ccode+1 is the repeat minimum; a non-zero
minimum means the item cannot be empty. */
1291 if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1296 /* Opcodes that must match a character */
1303 case OP_NOT_WHITESPACE:
1305 case OP_NOT_WORDCHAR:
1319 case OP_TYPEMINPLUS:
1331 /* In UTF-8 mode, STAR, MINSTAR, QUERY, MINQUERY, UPTO, and MINUPTO may be
1332 followed by a multibyte character */
/* Bytes of the form 10xxxxxx are UTF-8 continuation bytes; code is advanced
while the byte at code[2] is one of them. */
1341 if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1352 /*************************************************
1353 * Scan compiled regex for non-emptiness *
1354 *************************************************/
1356 /* This function is called to check for left recursive calls. We want to check
1357 the current branch of the current pattern to see if it could match the empty
1358 string. If it could, we must look outwards for branches at other levels,
1359 stopping when we pass beyond the bracket which is the subject of the recursion.
1362 code points to start of the recursion
1363 endcode points to where to stop (current RECURSE item)
1364 bcptr points to the chain of current (unclosed) branch starts
1365 utf8 TRUE if in UTF-8 mode
1367 Returns: TRUE if what is matched could be empty
/* could_be_empty: test each branch on the bcptr chain, moving outwards through
the "outer" links, for as long as the branch start is not before "code". */
1371 could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1374 while (bcptr != NULL && bcptr->current >= code)
/* One branch that cannot be empty is enough to give a FALSE result. */
1376 if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1377 bcptr = bcptr->outer;
1384 /*************************************************
1385 * Check for POSIX class syntax *
1386 *************************************************/
1388 /* This function is called when the sequence "[:" or "[." or "[=" is
1389 encountered in a character class. It checks whether this is followed by an
1390 optional ^ and then a sequence of letters, terminated by a matching ":]" or
1394 ptr pointer to the initial [
1395 endptr where to return the end pointer
1396 cd pointer to compile data
1398 Returns: TRUE or FALSE
/* check_posix_syntax: ptr is at the initial '['. The character after it is
taken as the terminator, an optional '^' is skipped, and then letters (as
classified by cd->ctypes) are skipped. */
1402 check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
1404 int terminator; /* Don't combine these lines; the Solaris cc */
1405 terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
1406 if (*(++ptr) == '^') ptr++;
1407 while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
/* The letters must be followed by the terminator character and then ']'. */
1408 if (*ptr == terminator && ptr[1] == ']')
1419 /*************************************************
1420 * Check POSIX class name *
1421 *************************************************/
1423 /* This function is called to check the name given in a POSIX-style class entry
1427 ptr points to the first letter
1428 len the length of the name
1430 Returns: a value representing the name, or -1 if unknown
/* check_posix_name: look the name up by comparing its length and text with the
entries of posix_name_lengths and posix_names; the index of the matching entry
is the yield. */
1434 check_posix_name(const uschar *ptr, int len)
1436 register int yield = 0;
/* A zero entry in posix_name_lengths ends the search. */
1437 while (posix_name_lengths[yield] != 0)
/* The length is tested first, so strncmp runs only for names of equal length. */
1439 if (len == posix_name_lengths[yield] &&
1440 strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
1447 /*************************************************
1448 * Adjust OP_RECURSE items in repeated group *
1449 *************************************************/
1451 /* OP_RECURSE items contain an offset from the start of the regex to the group
1452 that is referenced. This means that groups can be replicated for fixed
1453 repetition simply by copying (because the recursion is allowed to refer to
1454 earlier groups that are outside the current group). However, when a group is
1455 optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
1456 it, after it has been compiled. This means that any OP_RECURSE items within it
1457 that refer to the group itself or any contained groups have to have their
1458 offsets adjusted. That is the job of this function. Before it is called, the
1459 partially compiled regex must be temporarily terminated with OP_END.
1462 group points to the start of the group
1463 adjust the amount by which the group is to be moved
1464 utf8 TRUE in UTF-8 mode
1465 cd contains pointers to tables etc.
/* adjust_recurse: visit each OP_RECURSE item found from "group" onwards and add
"adjust" to the stored offset of those that refer to code at or after "group". */
1471 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd)
1473 uschar *ptr = group;
1474 while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
/* The offset stored in the item is relative to cd->start_code. */
1476 int offset = GET(ptr, 1);
1477 if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
/* Step past this item so that the next search starts after it. */
1478 ptr += 1 + LINK_SIZE;
1484 /*************************************************
1485 * Insert an automatic callout point *
1486 *************************************************/
1488 /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
1489 callout points before each pattern item.
1492 code current code pointer
1493 ptr current pattern pointer
1494 cd pointers to tables etc
1496 Returns: new code pointer
/* auto_callout: emit an OP_CALLOUT item at "code", storing the offset of ptr
from cd->start_pattern and a length of zero, and yield the advanced code
pointer. */
1500 auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
1502 *code++ = OP_CALLOUT;
1504 PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
1505 PUT(code, LINK_SIZE, 0); /* Default length */
/* The two fields written above sit at offsets 0 and LINK_SIZE, so the yield
skips 2*LINK_SIZE bytes. */
1506 return code + 2*LINK_SIZE;
1511 /*************************************************
1512 * Complete a callout item *
1513 *************************************************/
1515 /* A callout item contains the length of the next item in the pattern, which
1516 we can't fill in till after we have reached the relevant point. This is used
1517 for both automatic and manual callouts.
1520 previous_callout points to previous callout item
1521 ptr current pattern pointer
1522 cd pointers to tables etc
/* complete_callout: compute the distance from the offset held in a previous
callout item to the current pattern position, and store it in that item. */
1528 complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
/* The value read at offset 2 is presumably the pattern offset saved when the
item was created - NOTE(review): confirm against the callout item layout. */
1530 int length = ptr - cd->start_pattern - GET(previous_callout, 2);
/* The length is written LINK_SIZE bytes after the value read above. */
1531 PUT(previous_callout, 2 + LINK_SIZE, length);
1537 /*************************************************
1538 * Get othercase range *
1539 *************************************************/
1541 /* This function is passed the start and end of a class range, in UTF-8 mode
1542 with UCP support. It searches up the characters, looking for internal ranges of
1543 characters in the "other" case. Each call returns the next one, updating the
1547 cptr points to starting character value; updated
1549 ocptr where to put start of othercase range
1550 odptr where to put end of othercase range
1552 Yield: TRUE when range returned; FALSE when no more
/* get_othercase_range: starting at *cptr and going no further than d, find the
first character for which _pcre_ucp_othercase() gives a non-negative value,
then scan on over characters whose other-case values follow on from it. */
1556 get_othercase_range(int *cptr, int d, int *ocptr, int *odptr)
1558 int c, othercase, next;
/* Skip characters for which _pcre_ucp_othercase() gives a negative value. */
1560 for (c = *cptr; c <= d; c++)
1561 { if ((othercase = _pcre_ucp_othercase(c)) >= 0) break; }
/* The first loop ran past d without finding such a character. */
1563 if (c > d) return FALSE;
/* next is the other-case value that the following character must have for
the scan below to carry on. */
1566 next = othercase + 1;
1568 for (++c; c <= d; c++)
1570 if (_pcre_ucp_othercase(c) != next) break;
1579 #endif /* SUPPORT_UCP */
1582 /*************************************************
1583 * Compile one branch *
1584 *************************************************/
1586 /* Scan the pattern, compiling it into the code vector. If the options are
1587 changed during the branch, the pointer is used to change the external options
1591 optionsptr pointer to the option bits
1592 brackets points to number of extracting brackets used
1593 codeptr points to the pointer to the current code point
1594 ptrptr points to the current pattern pointer
1595 errorcodeptr points to error code variable
1596 firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
1597 reqbyteptr set to the last literal character required, else < 0
1598 bcptr points to current branch chain
1599 cd contains pointers to tables etc.
1601 Returns: TRUE on success
1602 FALSE, with *errorcodeptr set non-zero on error
1606 compile_branch(int *optionsptr, int *brackets, uschar **codeptr,
1607 const uschar **ptrptr, int *errorcodeptr, int *firstbyteptr,
1608 int *reqbyteptr, branch_chain *bcptr, compile_data *cd)
1610 int repeat_type, op_type;
1611 int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
1613 int greedy_default, greedy_non_default;
1614 int firstbyte, reqbyte;
1615 int zeroreqbyte, zerofirstbyte;
1616 int req_caseopt, reqvary, tempreqvary;
1617 int options = *optionsptr;
1618 int after_manual_callout = 0;
1620 register uschar *code = *codeptr;
1622 BOOL inescq = FALSE;
1623 BOOL groupsetfirstbyte = FALSE;
1624 const uschar *ptr = *ptrptr;
1625 const uschar *tempptr;
1626 uschar *previous = NULL;
1627 uschar *previous_callout = NULL;
1628 uschar classbits[32];
1632 BOOL utf8 = (options & PCRE_UTF8) != 0;
1633 uschar *class_utf8data;
1634 uschar utf8_char[6];
1639 /* Set up the default and non-default settings for greediness */
1641 greedy_default = ((options & PCRE_UNGREEDY) != 0);
1642 greedy_non_default = greedy_default ^ 1;
1644 /* Initialize no first byte, no required byte. REQ_UNSET means "no char
1645 matching encountered yet". It gets changed to REQ_NONE if we hit something that
1646 matches a non-fixed char first char; reqbyte just remains unset if we never
1649 When we hit a repeat whose minimum is zero, we may have to adjust these values
1650 to take the zero repeat into account. This is implemented by setting them to
1651 zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
1652 item types that can be repeated set these backoff variables appropriately. */
1654 firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
1656 /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
1657 according to the current setting of the caseless flag. REQ_CASELESS is a bit
1658 value > 255. It is added into the firstbyte or reqbyte variables to record the
1659 case status of the value. This is used only for ASCII characters. */
1661 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
1663 /* Switch on next character until the end of the branch */
1668 BOOL possessive_quantifier;
1670 int class_charcount;
1680 /* Next byte in the pattern */
1684 /* If in \Q...\E, check for the end; if not, we have a literal */
1686 if (inescq && c != 0)
1688 if (c == '\\' && ptr[1] == 'E')
1696 if (previous_callout != NULL)
1698 complete_callout(previous_callout, ptr, cd);
1699 previous_callout = NULL;
1701 if ((options & PCRE_AUTO_CALLOUT) != 0)
1703 previous_callout = code;
1704 code = auto_callout(code, ptr, cd);
1710 /* Fill in length of a previous callout, except when the next thing is
1713 is_quantifier = c == '*' || c == '+' || c == '?' ||
1714 (c == '{' && is_counted_repeat(ptr+1));
1716 if (!is_quantifier && previous_callout != NULL &&
1717 after_manual_callout-- <= 0)
1719 complete_callout(previous_callout, ptr, cd);
1720 previous_callout = NULL;
1723 /* In extended mode, skip white space and comments */
1725 if ((options & PCRE_EXTENDED) != 0)
1727 if ((cd->ctypes[c] & ctype_space) != 0) continue;
1730 while (*(++ptr) != 0) if (IS_NEWLINE(ptr)) break;
1733 ptr += cd->nllen - 1;
1736 /* Else fall through to handle end of string */
1741 /* No auto callout for quantifiers. */
1743 if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
1745 previous_callout = code;
1746 code = auto_callout(code, ptr, cd);
1751 /* The branch terminates at end of string, |, or ). */
1756 *firstbyteptr = firstbyte;
1757 *reqbyteptr = reqbyte;
1762 /* Handle single-character metacharacters. In multiline mode, ^ disables
1763 the setting of any following char as a first character. */
1766 if ((options & PCRE_MULTILINE) != 0)
1768 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
1779 /* There can never be a first char if '.' is first, whatever happens about
1780 repeats. The value of reqbyte doesn't change either. */
1783 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
1784 zerofirstbyte = firstbyte;
1785 zeroreqbyte = reqbyte;
1790 /* Character classes. If the included characters are all < 256, we build a
1791 32-byte bitmap of the permitted characters, except in the special case
1792 where there is only one such character. For negated classes, we build the
1793 map as usual, then invert it at the end. However, we use a different opcode
1794 so that data characters > 255 can be handled correctly.
1796 If the class contains characters outside the 0-255 range, a different
1797 opcode is compiled. It may optionally have a bit map for characters < 256,
1798 but those above are explicitly listed afterwards. A flag byte tells
1799 whether the bitmap is present, and whether this is a negated class or not.
1805 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
1806 they are encountered at the top level, so we'll do that too. */
1808 if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
1809 check_posix_syntax(ptr, &tempptr, cd))
1811 *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;
1815 /* If the first character is '^', set the negation flag and skip it. */
1817 if ((c = *(++ptr)) == '^')
1819 negate_class = TRUE;
1824 negate_class = FALSE;
1827 /* Keep a count of chars with values < 256 so that we can optimize the case
1828 of just a single character (as long as it's < 256). For higher valued UTF-8
1829 characters, we don't yet do any optimization. */
1831 class_charcount = 0;
1832 class_lastchar = -1;
1835 class_utf8 = FALSE; /* No chars >= 256 */
1836 class_utf8data = code + LINK_SIZE + 34; /* For UTF-8 items */
1839 /* Initialize the 32-char bit map to all zeros. We have to build the
1840 map in a temporary bit of store, in case the class contains only 1
1841 character (< 256), because in that case the compiled code doesn't use the
1844 memset(classbits, 0, 32 * sizeof(uschar));
1846 /* Process characters until ] is reached. By writing this as a "do" it
1847 means that an initial ] is taken as a data character. The first pass
1848 through the regex checked the overall syntax, so we don't need to be very
1849 strict here. At the start of the loop, c contains the first byte of the
1855 if (utf8 && c > 127)
1856 { /* Braces are required because the */
1857 GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
1861 /* Inside \Q...\E everything is literal except \E */
1865 if (c == '\\' && ptr[1] == 'E')
1871 else goto LONE_SINGLE_CHARACTER;
1874 /* Handle POSIX class names. Perl allows a negation extension of the
1875 form [:^name:]. A square bracket that doesn't match the syntax is
1876 treated as a literal. We also recognize the POSIX constructions
1877 [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
1881 (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
1882 check_posix_syntax(ptr, &tempptr, cd))
1884 BOOL local_negate = FALSE;
1885 int posix_class, taboffset, tabopt;
1886 register const uschar *cbits = cd->cbits;
1891 *errorcodeptr = ERR31;
1898 local_negate = TRUE;
1902 posix_class = check_posix_name(ptr, tempptr - ptr);
1903 if (posix_class < 0)
1905 *errorcodeptr = ERR30;
1909 /* If matching is caseless, upper and lower are converted to
1910 alpha. This relies on the fact that the class table starts with
1911 alpha, lower, upper as the first 3 entries. */
1913 if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
1916 /* We build the bit map for the POSIX class in a chunk of local store
1917 because we may be adding and subtracting from it, and we don't want to
1918 subtract bits that may be in the main map already. At the end we or the
1919 result into the bit map that is being built. */
1923 /* Copy in the first table (always present) */
1925 memcpy(pbits, cbits + posix_class_maps[posix_class],
1926 32 * sizeof(uschar));
1928 /* If there is a second table, add or remove it as required. */
1930 taboffset = posix_class_maps[posix_class + 1];
1931 tabopt = posix_class_maps[posix_class + 2];
1936 for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
1938 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
1941 /* Now see if we need to remove any special characters. An option
1942 value of 1 removes vertical space and 2 removes underscore. */
1944 if (tabopt < 0) tabopt = -tabopt;
1945 if (tabopt == 1) pbits[1] &= ~0x3c;
1946 else if (tabopt == 2) pbits[11] &= 0x7f;
1948 /* Add the POSIX table or its complement into the main table that is
1949 being built and we are done. */
1952 for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
1954 for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
1957 class_charcount = 10; /* Set > 1; assumes more than 1 per class */
1958 continue; /* End of POSIX syntax handling */
1961 /* Backslash may introduce a single character, or it may introduce one
1962 of the specials, which just set a flag. Escaped items are checked for
1963 validity in the pre-compiling pass. The sequence \b is a special case.
1964 Inside a class (and only there) it is treated as backspace. Elsewhere
1965 it marks a word boundary. Other escapes have preset maps ready to
1966 or into the one we are building. We assume they have more than one
1967 character in them, so set class_charcount bigger than one. */
1971 c = check_escape(&ptr, errorcodeptr, *brackets, options, TRUE);
1973 if (-c == ESC_b) c = '\b'; /* \b is backspace in a class */
1974 else if (-c == ESC_X) c = 'X'; /* \X is literal X in a class */
1975 else if (-c == ESC_Q) /* Handle start of quoted string */
1977 if (ptr[1] == '\\' && ptr[2] == 'E')
1979 ptr += 2; /* avoid empty string */
1987 register const uschar *cbits = cd->cbits;
1988 class_charcount += 2; /* Greater than 1 is what matters */
1992 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
1996 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
2000 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
2004 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
2008 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
2009 classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
2013 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
2014 classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
2023 int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
2024 if (ptype < 0) goto FAILED;
2026 *class_utf8data++ = ((-c == ESC_p) != negated)?
2027 XCL_PROP : XCL_NOTPROP;
2028 *class_utf8data++ = ptype;
2029 *class_utf8data++ = pdata;
2030 class_charcount -= 2; /* Not a < 256 character */
2035 /* Unrecognized escapes are faulted if PCRE is running in its
2036 strict mode. By default, for compatibility with Perl, they are
2037 treated as literals. */
2040 if ((options & PCRE_EXTRA) != 0)
2042 *errorcodeptr = ERR7;
2045 c = *ptr; /* The final character */
2046 class_charcount -= 2; /* Undo the default count from above */
2050 /* Fall through if we have a single character (c >= 0). This may be
2051 > 256 in UTF-8 mode. */
2053 } /* End of backslash handling */
2055 /* A single character may be followed by '-' to form a range. However,
2056 Perl does not permit ']' to be the end of the range. A '-' character
2057 here is treated as a literal. */
2059 if (ptr[1] == '-' && ptr[2] != ']')
2066 { /* Braces are required because the */
2067 GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
2071 d = *ptr; /* Not UTF-8 mode */
2073 /* The second part of a range can be a single-character escape, but
2074 not any of the other escapes. Perl 5.6 treats a hyphen as a literal
2075 in such circumstances. */
2079 const uschar *oldptr = ptr;
2080 d = check_escape(&ptr, errorcodeptr, *brackets, options, TRUE);
2082 /* \b is backspace; \X is literal X; any other special means the '-'
2087 if (d == -ESC_b) d = '\b';
2088 else if (d == -ESC_X) d = 'X'; else
2091 goto LONE_SINGLE_CHARACTER; /* A few lines below */
2096 /* The check that the two values are in the correct order happens in
2097 the pre-pass. Optimize one-character ranges */
2099 if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
2101 /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
2102 matching, we have to use an XCLASS with extra data items. Caseless
2103 matching for characters > 127 is available only if UCP support is
2107 if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
2111 /* With UCP support, we can find the other case equivalents of
2112 the relevant characters. There may be several ranges. Optimize how
2113 they fit with the basic range. */
2116 if ((options & PCRE_CASELESS) != 0)
2121 while (get_othercase_range(&cc, origd, &occ, &ocd))
2123 if (occ >= c && ocd <= d) continue; /* Skip embedded ranges */
2125 if (occ < c && ocd >= c - 1) /* Extend the basic range */
2126 { /* if there is overlap, */
2127 c = occ; /* noting that if occ < c */
2128 continue; /* we can't have ocd > d */
2129 } /* because a subrange is */
2130 if (ocd > d && occ <= d + 1) /* always shorter than */
2131 { /* the basic range. */
2138 *class_utf8data++ = XCL_SINGLE;
2142 *class_utf8data++ = XCL_RANGE;
2143 class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
2145 class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
2148 #endif /* SUPPORT_UCP */
2150 /* Now record the original range, possibly modified for UCP caseless
2151 overlapping ranges. */
2153 *class_utf8data++ = XCL_RANGE;
2154 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
2155 class_utf8data += _pcre_ord2utf8(d, class_utf8data);
2157 /* With UCP support, we are done. Without UCP support, there is no
2158 caseless matching for UTF-8 characters > 127; we can use the bit map
2159 for the smaller ones. */
2162 continue; /* With next character in the class */
2164 if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
2166 /* Adjust upper limit and fall through to set up the map */
2170 #endif /* SUPPORT_UCP */
2172 #endif /* SUPPORT_UTF8 */
2174 /* We use the bit map for all cases when not in UTF-8 mode; else
2175 ranges that lie entirely within 0-127 when there is UCP support; else
2176 for partial ranges without UCP support. */
2180 classbits[c/8] |= (1 << (c&7));
2181 if ((options & PCRE_CASELESS) != 0)
2183 int uc = cd->fcc[c]; /* flip case */
2184 classbits[uc/8] |= (1 << (uc&7));
2186 class_charcount++; /* in case a one-char range */
2190 continue; /* Go get the next char in the class */
2193 /* Handle a lone single character - we can get here for a normal
2194 non-escape char, or after \ that introduces a single character or for an
2195 apparent range that isn't. */
2197 LONE_SINGLE_CHARACTER:
2199 /* Handle a character that cannot go in the bit map */
2202 if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
2205 *class_utf8data++ = XCL_SINGLE;
2206 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
2209 if ((options & PCRE_CASELESS) != 0)
2212 if ((othercase = _pcre_ucp_othercase(c)) >= 0)
2214 *class_utf8data++ = XCL_SINGLE;
2215 class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
2218 #endif /* SUPPORT_UCP */
2222 #endif /* SUPPORT_UTF8 */
2224 /* Handle a single-byte character */
2226 classbits[c/8] |= (1 << (c&7));
2227 if ((options & PCRE_CASELESS) != 0)
2229 c = cd->fcc[c]; /* flip case */
2230 classbits[c/8] |= (1 << (c&7));
2237 /* Loop until ']' reached; the check for end of string happens inside the
2238 loop. This "while" is the end of the "do" above. */
2240 while ((c = *(++ptr)) != ']' || inescq);
2242 /* If class_charcount is 1, we saw precisely one character whose value is
2243 less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we
2244 can optimize the negative case only if there were no characters >= 128
2245 because OP_NOT and the related opcodes like OP_NOTSTAR operate on
2246 single-bytes only. This is an historical hangover. Maybe one day we can
2247 tidy these opcodes to handle multi-byte characters.
2249 The optimization throws away the bit map. We turn the item into a
2250 1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
2251 that OP_NOT does not support multibyte characters. In the positive case, it
2252 can cause firstbyte to be set. Otherwise, there can be no first char if
2253 this item is first, whatever repeat count may follow. In the case of
2254 reqbyte, save the previous value for reinstating. */
2257 if (class_charcount == 1 &&
2259 (!class_utf8 && (!negate_class || class_lastchar < 128))))
2262 if (class_charcount == 1)
2265 zeroreqbyte = reqbyte;
2267 /* The OP_NOT opcode works on one-byte characters only. */
2271 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2272 zerofirstbyte = firstbyte;
2274 *code++ = class_lastchar;
2278 /* For a single, positive character, get the value into mcbuffer, and
2279 then we can handle this with the normal one-character code. */
2282 if (utf8 && class_lastchar > 127)
2283 mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
2287 mcbuffer[0] = class_lastchar;
2291 } /* End of 1-char optimization */
2293 /* The general case - not the one-char optimization. If this is the first
2294 thing in the branch, there can be no first char setting, whatever the
2295 repeat count. Any reqbyte setting must remain unchanged after any kind of
2298 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2299 zerofirstbyte = firstbyte;
2300 zeroreqbyte = reqbyte;
2302 /* If there are characters with values > 255, we have to compile an
2303 extended class, with its own opcode. If there are no characters < 256,
2304 we can omit the bitmap. */
2309 *class_utf8data++ = XCL_END; /* Marks the end of extra data */
2310 *code++ = OP_XCLASS;
2312 *code = negate_class? XCL_NOT : 0;
2314 /* If the map is required, install it, and move on to the end of
2317 if (class_charcount > 0)
2320 memcpy(code, classbits, 32);
2321 code = class_utf8data;
2324 /* If the map is not required, slide down the extra data. */
2328 int len = class_utf8data - (code + 33);
2329 memmove(code + 1, code + 33, len);
2333 /* Now fill in the complete length of the item */
2335 PUT(previous, 1, code - previous);
2336 break; /* End of class handling */
2340 /* If there are no characters > 255, negate the 32-byte map if necessary,
2341 and copy it into the code vector. If this is the first thing in the branch,
2342 there can be no first char setting, whatever the repeat count. Any reqbyte
2343 setting must remain unchanged after any kind of repeat. */
2347 *code++ = OP_NCLASS;
2348 for (c = 0; c < 32; c++) code[c] = ~classbits[c];
2353 memcpy(code, classbits, 32);
2358 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
2359 has been tested above. */
2362 if (!is_quantifier) goto NORMAL_CHAR;
2363 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
2364 if (*errorcodeptr != 0) goto FAILED;
2382 if (previous == NULL)
2384 *errorcodeptr = ERR9;
2388 if (repeat_min == 0)
2390 firstbyte = zerofirstbyte; /* Adjust for zero repeat */
2391 reqbyte = zeroreqbyte; /* Ditto */
2394 /* Remember whether this is a variable length repeat */
2396 reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
2398 op_type = 0; /* Default single-char op codes */
2399 possessive_quantifier = FALSE; /* Default not possessive quantifier */
2401 /* Save start of previous item, in case we have to move it up to make space
2402 for an inserted OP_ONCE for the additional '+' extension. */
2404 tempcode = previous;
2406 /* If the next character is '+', we have a possessive quantifier. This
2407 implies greediness, whatever the setting of the PCRE_UNGREEDY option.
2408 If the next character is '?' this is a minimizing repeat, by default,
2409 but if PCRE_UNGREEDY is set, it works the other way round. We change the
2410 repeat type to the non-default. */
2414 repeat_type = 0; /* Force greedy */
2415 possessive_quantifier = TRUE;
2418 else if (ptr[1] == '?')
2420 repeat_type = greedy_non_default;
2423 else repeat_type = greedy_default;
2425 /* If previous was a recursion, we need to wrap it inside brackets so that
2426 it can be replicated if necessary. */
2428 if (*previous == OP_RECURSE)
2430 memmove(previous + 1 + LINK_SIZE, previous, 1 + LINK_SIZE);
2431 code += 1 + LINK_SIZE;
2433 PUT(previous, 1, code - previous);
2435 PUT(code, 1, code - previous);
2436 code += 1 + LINK_SIZE;
2439 /* If previous was a character match, abolish the item and generate a
2440 repeat item instead. If a char item has a minimum of more than one, ensure
2441 that it is set in reqbyte - it might not be if a sequence such as x{3} is
2442 the first thing in a branch because the x will have gone into firstbyte
2445 if (*previous == OP_CHAR || *previous == OP_CHARNC)
2447 /* Deal with UTF-8 characters that take up more than one byte. It's
2448 easier to write this out separately than try to macrify it. Use c to
2449 hold the length of the character in bytes, plus 0x80 to flag that it's a
2450 length rather than a small character. */
2453 if (utf8 && (code[-1] & 0x80) != 0)
2455 uschar *lastchar = code - 1;
2456 while((*lastchar & 0xc0) == 0x80) lastchar--;
2457 c = code - lastchar; /* Length of UTF-8 character */
2458 memcpy(utf8_char, lastchar, c); /* Save the char */
2459 c |= 0x80; /* Flag c as a length */
2464 /* Handle the case of a single byte - either with no UTF8 support, or
2465 with UTF-8 disabled, or for a UTF-8 character < 128. */
2469 if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
2472 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
2475 /* If previous was a single negated character ([^a] or similar), we use
2476 one of the special opcodes, replacing it. The code is shared with single-
2477 character repeats by setting op_type to add a suitable offset into
2478 repeat_type. OP_NOT is currently used only for single-byte chars. */
2480 else if (*previous == OP_NOT)
2482 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
2484 goto OUTPUT_SINGLE_REPEAT;
2487 /* If previous was a character type match (\d or similar), abolish it and
2488 create a suitable repeat item. The code is shared with single-character
2489 repeats by setting op_type to add a suitable offset into repeat_type. Note
2490 that the Unicode property types will be present only when SUPPORT_UCP is
2491 defined, but we don't wrap the little bits of code here because it just
2492 makes it horribly messy. */
2494 else if (*previous < OP_EODN)
2497 int prop_type, prop_value;
2498 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
2501 OUTPUT_SINGLE_REPEAT:
2502 if (*previous == OP_PROP || *previous == OP_NOTPROP)
2504 prop_type = previous[1];
2505 prop_value = previous[2];
2507 else prop_type = prop_value = -1;
2510 code = previous; /* Usually overwrite previous item */
2512 /* If the maximum is zero then the minimum must also be zero; Perl allows
2513 this case, so we do too - by simply omitting the item altogether. */
2515 if (repeat_max == 0) goto END_REPEAT;
2517 /* All real repeats make it impossible to handle partial matching (maybe
2518 one day we will be able to remove this restriction). */
2520 if (repeat_max != 1) cd->nopartial = TRUE;
2522 /* Combine the op_type with the repeat_type */
2524 repeat_type += op_type;
2526 /* A minimum of zero is handled either as the special case * or ?, or as
2527 an UPTO, with the maximum given. */
2529 if (repeat_min == 0)
2531 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
2532 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
2535 *code++ = OP_UPTO + repeat_type;
2536 PUT2INC(code, 0, repeat_max);
2540 /* A repeat minimum of 1 is optimized into some special cases. If the
2541 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
2542 left in place and, if the maximum is greater than 1, we use OP_UPTO with
2543 one less than the maximum. */
2545 else if (repeat_min == 1)
2547 if (repeat_max == -1)
2548 *code++ = OP_PLUS + repeat_type;
2551 code = oldcode; /* leave previous item in place */
2552 if (repeat_max == 1) goto END_REPEAT;
2553 *code++ = OP_UPTO + repeat_type;
2554 PUT2INC(code, 0, repeat_max - 1);
2558 /* The case {n,n} is just an EXACT, while the general case {n,m} is
2559 handled as an EXACT followed by an UPTO. */
2563 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
2564 PUT2INC(code, 0, repeat_min);
2566 /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
2567 we have to insert the character for the previous code. For a repeated
2568 Unicode property match, there are two extra bytes that define the
2569 required property. In UTF-8 mode, long characters have their length in
2570 c, with the 0x80 bit as a flag. */
2575 if (utf8 && c >= 128)
2577 memcpy(code, utf8_char, c & 7);
2586 *code++ = prop_type;
2587 *code++ = prop_value;
2590 *code++ = OP_STAR + repeat_type;
2593 /* Else insert an UPTO if the max is greater than the min, again
2594 preceded by the character, for the previously inserted code. */
2596 else if (repeat_max != repeat_min)
2599 if (utf8 && c >= 128)
2601 memcpy(code, utf8_char, c & 7);
2609 *code++ = prop_type;
2610 *code++ = prop_value;
2612 repeat_max -= repeat_min;
2613 *code++ = OP_UPTO + repeat_type;
2614 PUT2INC(code, 0, repeat_max);
2618 /* The character or character type itself comes last in all cases. */
2621 if (utf8 && c >= 128)
2623 memcpy(code, utf8_char, c & 7);
2630 /* For a repeated Unicode property match, there are two extra bytes that
2631 define the required property. */
2636 *code++ = prop_type;
2637 *code++ = prop_value;
2642 /* If previous was a character class or a back reference, we put the repeat
2643 stuff after it, but just skip the item if the repeat was {0,0}. */
2645 else if (*previous == OP_CLASS ||
2646 *previous == OP_NCLASS ||
2648 *previous == OP_XCLASS ||
2650 *previous == OP_REF)
2652 if (repeat_max == 0)
2658 /* All real repeats make it impossible to handle partial matching (maybe
2659 one day we will be able to remove this restriction). */
2661 if (repeat_max != 1) cd->nopartial = TRUE;
2663 if (repeat_min == 0 && repeat_max == -1)
2664 *code++ = OP_CRSTAR + repeat_type;
2665 else if (repeat_min == 1 && repeat_max == -1)
2666 *code++ = OP_CRPLUS + repeat_type;
2667 else if (repeat_min == 0 && repeat_max == 1)
2668 *code++ = OP_CRQUERY + repeat_type;
2671 *code++ = OP_CRRANGE + repeat_type;
2672 PUT2INC(code, 0, repeat_min);
2673 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
2674 PUT2INC(code, 0, repeat_max);
2678 /* If previous was a bracket group, we may have to replicate it in certain
2681 else if (*previous >= OP_BRA || *previous == OP_ONCE ||
2682 *previous == OP_COND)
2686 int len = code - previous;
2687 uschar *bralink = NULL;
2689 /* If the maximum repeat count is unlimited, find the end of the bracket
2690 by scanning through from the start, and compute the offset back to it
2691 from the current code pointer. There may be an OP_OPT setting following
2692 the final KET, so we can't find the end just by going back from the code
2695 if (repeat_max == -1)
2697 register uschar *ket = previous;
2698 do ket += GET(ket, 1); while (*ket != OP_KET);
2699 ketoffset = code - ket;
2702 /* The case of a zero minimum is special because of the need to stick
2703 OP_BRAZERO in front of it, and because the group appears once in the
2704 data, whereas in other cases it appears the minimum number of times. For
2705 this reason, it is simplest to treat this case separately, as otherwise
2706 the code gets far too messy. There are several special subcases when the
2709 if (repeat_min == 0)
2711 /* If the maximum is also zero, we just omit the group from the output
2714 if (repeat_max == 0)
2720 /* If the maximum is 1 or unlimited, we just have to stick in the
2721 BRAZERO and do no more at this point. However, we do need to adjust
2722 any OP_RECURSE calls inside the group that refer to the group itself or
2723 any internal group, because the offset is from the start of the whole
2724 regex. Temporarily terminate the pattern while doing this. */
2726 if (repeat_max <= 1)
2729 adjust_recurse(previous, 1, utf8, cd);
2730 memmove(previous+1, previous, len);
2732 *previous++ = OP_BRAZERO + repeat_type;
2735 /* If the maximum is greater than 1 and limited, we have to replicate
2736 in a nested fashion, sticking OP_BRAZERO before each set of brackets.
2737 The first one has to be handled carefully because it's the original
2738 copy, which has to be moved up. The remainder can be handled by code
2739 that is common with the non-zero minimum case below. We have to
2740 adjust the value of repeat_max, since one less copy is required. Once
2741 again, we may have to adjust any OP_RECURSE calls inside the group. */
2747 adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd);
2748 memmove(previous + 2 + LINK_SIZE, previous, len);
2749 code += 2 + LINK_SIZE;
2750 *previous++ = OP_BRAZERO + repeat_type;
2751 *previous++ = OP_BRA;
2753 /* We chain together the bracket offset fields that have to be
2754 filled in later when the ends of the brackets are reached. */
2756 offset = (bralink == NULL)? 0 : previous - bralink;
2758 PUTINC(previous, 0, offset);
2764 /* If the minimum is greater than zero, replicate the group as many
2765 times as necessary, and adjust the maximum to the number of subsequent
2766 copies that we need. If we set a first char from the group, and didn't
2767 set a required char, copy the latter from the former. */
2773 if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
2774 for (i = 1; i < repeat_min; i++)
2776 memcpy(code, previous, len);
2780 if (repeat_max > 0) repeat_max -= repeat_min;
2783 /* This code is common to both the zero and non-zero minimum cases. If
2784 the maximum is limited, it replicates the group in a nested fashion,
2785 remembering the bracket starts on a stack. In the case of a zero minimum,
2786 the first one was set up above. In all cases the repeat_max now specifies
2787 the number of additional copies needed. */
2789 if (repeat_max >= 0)
2791 for (i = repeat_max - 1; i >= 0; i--)
2793 *code++ = OP_BRAZERO + repeat_type;
2795 /* All but the final copy start a new nesting, maintaining the
2796 chain of brackets outstanding. */
2802 offset = (bralink == NULL)? 0 : code - bralink;
2804 PUTINC(code, 0, offset);
2807 memcpy(code, previous, len);
2811 /* Now chain through the pending brackets, and fill in their length
2812 fields (which are holding the chain links pro tem). */
2814 while (bralink != NULL)
2817 int offset = code - bralink + 1;
2818 uschar *bra = code - offset;
2819 oldlinkoffset = GET(bra, 1);
2820 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
2822 PUTINC(code, 0, offset);
2823 PUT(bra, 1, offset);
2827 /* If the maximum is unlimited, set a repeater in the final copy. We
2828 can't just offset backwards from the current code point, because we
2829 don't know if there's been an options resetting after the ket. The
2830 correct offset was computed above. */
2832 else code[-ketoffset] = OP_KETRMAX + repeat_type;
2835 /* Else there's some kind of shambles */
2839 *errorcodeptr = ERR11;
2843 /* If the character following a repeat is '+', we wrap the entire repeated
2844 item inside OP_ONCE brackets. This is just syntactic sugar, taken from
2845 Sun's Java package. The repeated item starts at tempcode, not at previous,
2846 which might be the first part of a string whose (former) last char we
2847 repeated. However, we don't support '+' after a greediness '?'. */
2849 if (possessive_quantifier)
2851 int len = code - tempcode;
2852 memmove(tempcode + 1+LINK_SIZE, tempcode, len);
2853 code += 1 + LINK_SIZE;
2854 len += 1 + LINK_SIZE;
2855 tempcode[0] = OP_ONCE;
2857 PUTINC(code, 0, len);
2858 PUT(tempcode, 1, len);
2861 /* In all cases we no longer have a previous item. We also set the
2862 "follows varying string" flag for subsequently encountered reqbytes if
2863 it isn't already set and we have just passed a varying length item. */
2867 cd->req_varyopt |= reqvary;
2871 /* Start of nested bracket sub-expression, or comment or lookahead or
2872 lookbehind or option setting or condition. First deal with special things
2873 that can come after a bracket; all are introduced by ?, and the appearance
2874 of any of them means that this is not a referencing group. They were
2875 checked for validity in the first pass over the string, so we don't have to
2876 check for syntax errors here. */
2879 newoptions = options;
2882 if (*(++ptr) == '?')
2889 case '#': /* Comment; skip to ket */
2891 while (*ptr != ')') ptr++;
2894 case ':': /* Non-extracting bracket */
2900 bravalue = OP_COND; /* Conditional group */
2902 /* A condition can be a number, referring to a numbered group, a name,
2903 referring to a named group, 'R', referring to recursion, or an
2904 assertion. There are two unfortunate ambiguities, caused by history.
2905 (a) 'R' can be the recursive thing or the name 'R', and (b) a number
2906 could be a name that consists of digits. In both cases, we look for a
2907 name first; if not found, we try the other cases. If the first
2908 character after (?( is a word character, we know the rest up to ) will
2909 also be word characters because the syntax was checked in the first
2912 if ((cd->ctypes[ptr[1]] & ctype_word) != 0)
2917 uschar *slot = cd->name_table;
2919 /* This is needed for all successful cases. */
2923 /* Read the name, but also get it as a number if it's all digits */
2929 condref = ((digitab[*ptr] & ctype_digit) != 0)?
2930 condref * 10 + *ptr - '0' : -1;
2933 namelen = ptr - name;
2936 for (i = 0; i < cd->names_found; i++)
2938 if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
2939 slot += cd->name_entry_size;
2942 /* Found a previous named subpattern */
2944 if (i < cd->names_found)
2946 condref = GET2(slot, 0);
2947 code[1+LINK_SIZE] = OP_CREF;
2948 PUT2(code, 2+LINK_SIZE, condref);
2951 /* Search the pattern for a forward reference */
2953 else if ((i = find_named_parens(ptr, *brackets, name, namelen)) > 0)
2955 code[1+LINK_SIZE] = OP_CREF;
2956 PUT2(code, 2+LINK_SIZE, i);
2959 /* Check for 'R' for recursion */
2961 else if (namelen == 1 && *name == 'R')
2963 code[1+LINK_SIZE] = OP_CREF;
2964 PUT2(code, 2+LINK_SIZE, CREF_RECURSE);
2967 /* Check for a subpattern number */
2969 else if (condref > 0)
2971 code[1+LINK_SIZE] = OP_CREF;
2972 PUT2(code, 2+LINK_SIZE, condref);
2975 /* Either an unidentified subpattern, or a reference to (?(0) */
2979 *errorcodeptr = (condref == 0)? ERR35: ERR15;
2984 /* For conditions that are assertions, we just fall through, having
2985 set bravalue above. */
2989 case '=': /* Positive lookahead */
2990 bravalue = OP_ASSERT;
2994 case '!': /* Negative lookahead */
2995 bravalue = OP_ASSERT_NOT;
2999 case '<': /* Lookbehinds */
3002 case '=': /* Positive lookbehind */
3003 bravalue = OP_ASSERTBACK;
3007 case '!': /* Negative lookbehind */
3008 bravalue = OP_ASSERTBACK_NOT;
3014 case '>': /* One-time brackets */
3019 case 'C': /* Callout - may be followed by digits; */
3020 previous_callout = code; /* Save for later completion */
3021 after_manual_callout = 1; /* Skip one item before completing */
3022 *code++ = OP_CALLOUT; /* Already checked that the terminating */
3023 { /* closing parenthesis is present. */
3025 while ((digitab[*(++ptr)] & ctype_digit) != 0)
3026 n = n * 10 + *ptr - '0';
3029 *errorcodeptr = ERR38;
3033 PUT(code, 0, ptr - cd->start_pattern + 1); /* Pattern offset */
3034 PUT(code, LINK_SIZE, 0); /* Default length */
3035 code += 2 * LINK_SIZE;
3040 case 'P': /* Named subpattern handling */
3041 if (*(++ptr) == '<') /* Definition */
3044 uschar *slot = cd->name_table;
3045 const uschar *name; /* Don't amalgamate; some compilers */
3046 name = ++ptr; /* grumble at autoincrement in declaration */
3048 while (*ptr++ != '>');
3049 namelen = ptr - name - 1;
3051 for (i = 0; i < cd->names_found; i++)
3053 int crc = memcmp(name, slot+2, namelen);
3056 if (slot[2+namelen] == 0)
3058 if ((options & PCRE_DUPNAMES) == 0)
3060 *errorcodeptr = ERR43;
3064 else crc = -1; /* Current name is substring */
3068 memmove(slot + cd->name_entry_size, slot,
3069 (cd->names_found - i) * cd->name_entry_size);
3072 slot += cd->name_entry_size;
3075 PUT2(slot, 0, *brackets + 1);
3076 memcpy(slot + 2, name, namelen);
3077 slot[2+namelen] = 0;
3079 goto NUMBERED_GROUP;
3082 if (*ptr == '=' || *ptr == '>') /* Reference or recursion */
3086 const uschar *name = ptr;
3087 uschar *slot = cd->name_table;
3089 while (*ptr != ')') ptr++;
3090 namelen = ptr - name;
3092 for (i = 0; i < cd->names_found; i++)
3094 if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
3095 slot += cd->name_entry_size;
3098 if (i < cd->names_found) /* Back reference */
3100 recno = GET2(slot, 0);
3102 else if ((recno = /* Forward back reference */
3103 find_named_parens(ptr, *brackets, name, namelen)) <= 0)
3105 *errorcodeptr = ERR15;
3109 if (type == '>') goto HANDLE_RECURSION; /* A few lines below */
3111 /* Back reference */
3115 PUT2INC(code, 0, recno);
3116 cd->backref_map |= (recno < 32)? (1 << recno) : 1;
3117 if (recno > cd->top_backref) cd->top_backref = recno;
3121 /* Should never happen */
3124 case 'R': /* Pattern recursion */
3125 ptr++; /* Same as (?0) */
3128 /* Recursion or "subroutine" call */
3130 case '0': case '1': case '2': case '3': case '4':
3131 case '5': case '6': case '7': case '8': case '9':
3133 const uschar *called;
3135 while((digitab[*ptr] & ctype_digit) != 0)
3136 recno = recno * 10 + *ptr++ - '0';
3138 /* Come here from code above that handles a named recursion */
3144 /* Find the bracket that is being referenced. Temporarily end the
3145 regex in case it doesn't exist. */
3148 called = (recno == 0)? cd->start_code :
3149 find_bracket(cd->start_code, utf8, recno);
3152 *errorcodeptr = ERR15;
3156 /* If the subpattern is still open, this is a recursive call. We
3157 check to see if this is a left recursion that could loop for ever,
3158 and diagnose that case. */
3160 if (GET(called, 1) == 0 && could_be_empty(called, code, bcptr, utf8))
3162 *errorcodeptr = ERR40;
3166 /* Insert the recursion/subroutine item, automatically wrapped inside
3170 PUT(code, 1, 2 + 2*LINK_SIZE);
3171 code += 1 + LINK_SIZE;
3174 PUT(code, 1, called - cd->start_code);
3175 code += 1 + LINK_SIZE;
3178 PUT(code, 1, 2 + 2*LINK_SIZE);
3179 code += 1 + LINK_SIZE;
3183 /* Character after (? not specially recognized */
3185 default: /* Option setting */
3189 while (*ptr != ')' && *ptr != ':')
3193 case '-': optset = &unset; break;
3195 case 'i': *optset |= PCRE_CASELESS; break;
3196 case 'J': *optset |= PCRE_DUPNAMES; break;
3197 case 'm': *optset |= PCRE_MULTILINE; break;
3198 case 's': *optset |= PCRE_DOTALL; break;
3199 case 'x': *optset |= PCRE_EXTENDED; break;
3200 case 'U': *optset |= PCRE_UNGREEDY; break;
3201 case 'X': *optset |= PCRE_EXTRA; break;
3205 /* Set up the changed option bits, but don't change anything yet. */
3207 newoptions = (options | set) & (~unset);
3209 /* If the options ended with ')' this is not the start of a nested
3210 group with option changes, so the options change at this level. Compile
3211 code to change the ims options if this setting actually changes any of
3212 them. We also pass the new setting back so that it can be put at the
3213 start of any following branches, and when this group ends (if we are in
3214 a group), a resetting item can be compiled.
3216 Note that if this item is right at the start of the pattern, the
3217 options will have been abstracted and made global, so there will be no
3218 change to compile. */
3222 if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
3225 *code++ = newoptions & PCRE_IMS;
3228 /* Change options at this level, and pass them back for use
3229 in subsequent branches. Reset the greedy defaults and the case
3230 value for firstbyte and reqbyte. */
3232 *optionsptr = options = newoptions;
3233 greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
3234 greedy_non_default = greedy_default ^ 1;
3235 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
3237 previous = NULL; /* This item can't be repeated */
3238 continue; /* It is complete */
3241 /* If the options ended with ':' we are heading into a nested group
3242 with possible change of options. Such groups are non-capturing and are
3243 not assertions of any kind. All we need to do is skip over the ':';
3244 the newoptions value is handled below. */
3251 /* If PCRE_NO_AUTO_CAPTURE is set, all unadorned brackets become
3252 non-capturing and behave like (?:...) brackets */
3254 else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
3259 /* Else we have a referencing group; adjust the opcode. If the bracket
3260 number is greater than EXTRACT_BASIC_MAX, we set the opcode one higher, and
3261 arrange for the true number to follow later, in an OP_BRANUMBER item. */
3266 if (++(*brackets) > EXTRACT_BASIC_MAX)
3268 bravalue = OP_BRA + EXTRACT_BASIC_MAX + 1;
3269 code[1+LINK_SIZE] = OP_BRANUMBER;
3270 PUT2(code, 2+LINK_SIZE, *brackets);
3273 else bravalue = OP_BRA + *brackets;
3276 /* Process nested bracketed re. Assertions may not be repeated, but other
3277 kinds can be. We copy code into a non-register variable in order to be able
3278 to pass its address because some compilers complain otherwise. Pass in a
3279 new setting for the ims options if they have changed. */
3281 previous = (bravalue >= OP_ONCE)? code : NULL;
3284 tempreqvary = cd->req_varyopt; /* Save value before bracket */
3287 newoptions, /* The complete new option state */
3288 options & PCRE_IMS, /* The previous ims option state */
3289 brackets, /* Extracting bracket count */
3290 &tempcode, /* Where to put code (updated) */
3291 &ptr, /* Input pointer (updated) */
3292 errorcodeptr, /* Where to put an error message */
3293 (bravalue == OP_ASSERTBACK ||
3294 bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
3295 skipbytes, /* Skip over OP_COND/OP_BRANUMBER */
3296 &subfirstbyte, /* For possible first char */
3297 &subreqbyte, /* For possible last char */
3298 bcptr, /* Current branch chain */
3299 cd)) /* Tables block */
3302 /* At the end of compiling, code is still pointing to the start of the
3303 group, while tempcode has been updated to point past the end of the group
3304 and any option resetting that may follow it. The pattern pointer (ptr)
3305 is on the bracket. */
3307 /* If this is a conditional bracket, check that there are no more than
3308 two branches in the group. */
3310 else if (bravalue == OP_COND)
3319 while (*tc != OP_KET);
3323 *errorcodeptr = ERR27;
3327 /* If there is just one branch, we must not make use of its firstbyte or
3328 reqbyte, because this is equivalent to an empty second branch. */
3330 if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
3333 /* Handle updating of the required and first characters. Update for normal
3334 brackets of all kinds, and conditions with two branches (see code above).
3335 If the bracket is followed by a quantifier with zero repeat, we have to
3336 back off. Hence the definition of zeroreqbyte and zerofirstbyte outside the
3337 main loop so that they can be accessed for the back off. */
3339 zeroreqbyte = reqbyte;
3340 zerofirstbyte = firstbyte;
3341 groupsetfirstbyte = FALSE;
3343 if (bravalue >= OP_BRA || bravalue == OP_ONCE || bravalue == OP_COND)
3345 /* If we have not yet set a firstbyte in this branch, take it from the
3346 subpattern, remembering that it was set here so that a repeat of more
3347 than one can replicate it as reqbyte if necessary. If the subpattern has
3348 no firstbyte, set "none" for the whole branch. In both cases, a zero
3349 repeat forces firstbyte to "none". */
3351 if (firstbyte == REQ_UNSET)
3353 if (subfirstbyte >= 0)
3355 firstbyte = subfirstbyte;
3356 groupsetfirstbyte = TRUE;
3358 else firstbyte = REQ_NONE;
3359 zerofirstbyte = REQ_NONE;
3362 /* If firstbyte was previously set, convert the subpattern's firstbyte
3363 into reqbyte if there wasn't one, using the vary flag that was in
3364 existence beforehand. */
3366 else if (subfirstbyte >= 0 && subreqbyte < 0)
3367 subreqbyte = subfirstbyte | tempreqvary;
3369 /* If the subpattern set a required byte (or set a first byte that isn't
3370 really the first byte - see above), set it. */
3372 if (subreqbyte >= 0) reqbyte = subreqbyte;
3375 /* For a forward assertion, we take the reqbyte, if set. This can be
3376 helpful if the pattern that follows the assertion doesn't set a different
3377 char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
3378 for an assertion, however because it leads to incorrect effect for patterns
3379 such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
3380 of a firstbyte. This is overcome by a scan at the end if there's no
3381 firstbyte, looking for an asserted first char. */
3383 else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
3385 /* Now update the main code pointer to the end of the group. */
3389 /* Error if hit end of pattern */
3393 *errorcodeptr = ERR14;
3398 /* Check \ for being a real metacharacter; if not, fall through and handle
3399 it as a data character at the start of a string. Escape items are checked
3400 for validity in the pre-compiling pass. */
3404 c = check_escape(&ptr, errorcodeptr, *brackets, options, FALSE);
3406 /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
3407 are arranged to be the negation of the corresponding OP_values. For the
3408 back references, the values are ESC_REF plus the reference number. Only
3409 back references and those types that consume a character may be repeated.
3410 We can test for values between ESC_b and ESC_Z for the latter; this may
3411 have to change if any new ones are ever created. */
3415 if (-c == ESC_Q) /* Handle start of quoted string */
3417 if (ptr[1] == '\\' && ptr[2] == 'E') ptr += 2; /* avoid empty string */
3422 /* For metasequences that actually match a character, we disable the
3423 setting of a first character if it hasn't already been set. */
3425 if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
3426 firstbyte = REQ_NONE;
3428 /* Set values to reset to if this is followed by a zero repeat. */
3430 zerofirstbyte = firstbyte;
3431 zeroreqbyte = reqbyte;
3433 /* Back references are handled specially */
3437 int number = -c - ESC_REF;
3440 PUT2INC(code, 0, number);
3443 /* So are Unicode property matches, if supported. We know that get_ucp
3444 won't fail because it was tested in the pre-pass. */
3447 else if (-c == ESC_P || -c == ESC_p)
3451 int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
3453 *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
3459 /* For the rest, we can obtain the OP value by negating the escape
3464 previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
3470 /* We have a data character whose value is in c. In UTF-8 mode it may have
3471 a value > 127. We set its representation in the length/buffer, and then
3472 handle it as a data character. */
3475 if (utf8 && c > 127)
3476 mclength = _pcre_ord2utf8(c, mcbuffer);
3487 /* Handle a literal character. It is guaranteed not to be whitespace or #
3488 when the extended flag is set. If we are in UTF-8 mode, it may be a
3489 multi-byte literal character. */
3497 if (utf8 && (c & 0xc0) == 0xc0)
3499 while ((ptr[1] & 0xc0) == 0x80)
3500 mcbuffer[mclength++] = *(++ptr);
3504 /* At this point we have the character's bytes in mcbuffer, and the length
3505 in mclength. When not in UTF-8 mode, the length is always 1. */
3509 *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARNC : OP_CHAR;
3510 for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
3512 /* Set the first and required bytes appropriately. If no previous first
3513 byte, set it from this character, but revert to none on a zero repeat.
3514 Otherwise, leave the firstbyte value alone, and don't change it on a zero
3517 if (firstbyte == REQ_UNSET)
3519 zerofirstbyte = REQ_NONE;
3520 zeroreqbyte = reqbyte;
3522 /* If the character is more than one byte long, we can set firstbyte
3523 only if it is not to be matched caselessly. */
3525 if (mclength == 1 || req_caseopt == 0)
3527 firstbyte = mcbuffer[0] | req_caseopt;
3528 if (mclength != 1) reqbyte = code[-1] | cd->req_varyopt;
3530 else firstbyte = reqbyte = REQ_NONE;
3533 /* firstbyte was previously set; we can set reqbyte only if the length is
3534 1 or the matching is caseful. */
3538 zerofirstbyte = firstbyte;
3539 zeroreqbyte = reqbyte;
3540 if (mclength == 1 || req_caseopt == 0)
3541 reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
3544 break; /* End of literal character handling */
3546 } /* end of big loop */
3548 /* Control never reaches here by falling through, only by a goto for all the
3549 error states. Pass back the position in the pattern so that it can be displayed
3550 to the user for diagnosing the error. */
3560 /*************************************************
3561 * Compile sequence of alternatives *
3562 *************************************************/
3564 /* On entry, ptr is pointing past the bracket character, but on return
3565 it points to the closing bracket, or vertical bar, or end of string.
3566 The code variable is pointing at the byte into which the BRA operator has been
3567 stored. If the ims options are changed at the start (for a (?ims: group) or
3568 during any branch, we need to insert an OP_OPT item at the start of every
3569 following branch to ensure they get set correctly at run time, and also pass
3570 the new options into every subsequent branch compile.
Arguments:
3573 options option bits, including any changes for this subpattern
3574 oldims previous settings of ims option bits
3575 brackets -> int containing the number of extracting brackets used
3576 codeptr -> the address of the current code pointer
3577 ptrptr -> the address of the current pattern pointer
3578 errorcodeptr -> pointer to error code variable
3579 lookbehind TRUE if this is a lookbehind assertion
3580 skipbytes skip this many bytes at start (for OP_COND, OP_BRANUMBER)
3581 firstbyteptr place to put the first required character, or a negative number
3582 reqbyteptr place to put the last required character, or a negative number
3583 bcptr pointer to the chain of currently open branches
3584 cd points to the data block with tables pointers etc.
3586 Returns: TRUE on success
Outputs: the first-byte and required-byte values merged across all branches
are passed back through firstbyteptr and reqbyteptr. Each branch is compiled
by compile_branch(), which is given the addresses of the local options, code
and pattern pointers. For a lookbehind, every branch is measured with
find_fixedlength() and the result is written into the branch's OP_REVERSE
item; on failure the error code is set to ERR36 when the measured length is
-2 and to ERR25 otherwise.
NOTE(review): the test guarding that error path is presumably length < 0, and
the failure return value is presumably FALSE - confirm against the full file.
3590 compile_regex(int options, int oldims, int *brackets, uschar **codeptr,
3591 const uschar **ptrptr, int *errorcodeptr, BOOL lookbehind, int skipbytes,
3592 int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd)
3594 const uschar *ptr = *ptrptr;
3595 uschar *code = *codeptr;
3596 uschar *last_branch = code;
3597 uschar *start_bracket = code;
3598 uschar *reverse_count = NULL;
3599 int firstbyte, reqbyte;
3600 int branchfirstbyte, branchreqbyte;
3606 firstbyte = reqbyte = REQ_UNSET;
3608 /* Offset is set zero to mark that this bracket is still open */
3611 code += 1 + LINK_SIZE + skipbytes;
3613 /* Loop for each alternative branch */
3617 /* Handle a change of ims options at the start of the branch */
3619 if ((options & PCRE_IMS) != oldims)
3622 *code++ = options & PCRE_IMS;
3625 /* Set up dummy OP_REVERSE if lookbehind assertion */
3629 *code++ = OP_REVERSE;
3630 reverse_count = code;
/* reverse_count remembers where this branch's length is to be written once the
branch has been measured (see the PUT on reverse_count further down). */
3634 /* Now compile the branch */
3636 if (!compile_branch(&options, brackets, &code, &ptr, errorcodeptr,
3637 &branchfirstbyte, &branchreqbyte, &bc, cd))
3643 /* If this is the first branch, the firstbyte and reqbyte values for the
3644 branch become the values for the regex. */
3646 if (*last_branch != OP_ALT)
3648 firstbyte = branchfirstbyte;
3649 reqbyte = branchreqbyte;
3652 /* If this is not the first branch, the first char and reqbyte have to
3653 match the values from all the previous branches, except that if the previous
3654 value for reqbyte didn't have REQ_VARY set, it can still match, and we set
3655 REQ_VARY for the regex. */
3659 /* If we previously had a firstbyte, but it doesn't match the new branch,
3660 we have to abandon the firstbyte for the regex, but if there was previously
3661 no reqbyte, it takes on the value of the old firstbyte. */
3663 if (firstbyte >= 0 && firstbyte != branchfirstbyte)
3665 if (reqbyte < 0) reqbyte = firstbyte;
3666 firstbyte = REQ_NONE;
3669 /* If we (now or from before) have no firstbyte, a firstbyte from the
3670 branch becomes a reqbyte if there isn't a branch reqbyte. */
3672 if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
3673 branchreqbyte = branchfirstbyte;
3675 /* Now ensure that the reqbytes match */
/* The comparison ignores the REQ_VARY bit on both sides. */
3677 if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
3679 else reqbyte |= branchreqbyte; /* To "or" REQ_VARY */
3682 /* If lookbehind, check that this branch matches a fixed-length string,
3683 and put the length into the OP_REVERSE item. Temporarily mark the end of
3684 the branch with OP_END. */
3690 length = find_fixedlength(last_branch, options);
3691 DPRINTF(("fixed length = %d\n", length));
/* A length of -2 selects ERR36; any other value reaching this assignment
selects ERR25. */
3694 *errorcodeptr = (length == -2)? ERR36 : ERR25;
3698 PUT(reverse_count, 0, length);
3701 /* Reached end of expression, either ')' or end of pattern. Go back through
3702 the alternative branches and reverse the chain of offsets, with the field in
3703 the BRA item now becoming an offset to the first alternative. If there are
3704 no alternatives, it points to the end of the group. The length in the
3705 terminating ket is always the length of the whole bracketed item. If any of
3706 the ims options were changed inside the group, compile a resetting op-code
3707 following, except at the very end of the pattern. Return leaving the pointer
3708 at the terminating char. */
3712 int length = code - last_branch;
/* Each pass reads the backward link stored at last_branch, overwrites it with
the forward length, and then steps back by the old link to the previous
branch. */
3715 int prev_length = GET(last_branch, 1);
3716 PUT(last_branch, 1, length);
3717 length = prev_length;
3718 last_branch -= length;
3722 /* Fill in the ket */
3725 PUT(code, 1, code - start_bracket);
3726 code += 1 + LINK_SIZE;
3728 /* Resetting option if needed */
3730 if ((options & PCRE_IMS) != oldims && *ptr == ')')
3736 /* Set values to pass back */
3740 *firstbyteptr = firstbyte;
3741 *reqbyteptr = reqbyte;
3745 /* Another branch follows; insert an "or" node. Its length field points back
3746 to the previous branch while the bracket remains open. At the end the chain
3747 is reversed. It's done like this so that the start of the bracket has a
3748 zero offset until it is closed, making it possible to detect recursion. */
3751 PUT(code, 1, code - last_branch);
/* bc.current is moved along with last_branch; &bc is what compile_branch() is
handed for each branch. */
3752 bc.current = last_branch = code;
3753 code += 1 + LINK_SIZE;
3756 /* Control never reaches here */
3762 /*************************************************
3763 * Check for anchored expression *
3764 *************************************************/
3766 /* Try to find out if this is an anchored regular expression. Consider each
3767 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
3768 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
3769 it's anchored. However, if this is a multiline pattern, then only OP_SOD
3770 counts, since OP_CIRC can match in the middle.
3772 We can also consider a regex to be anchored if OP_SOM starts all its branches.
3773 This is the code for \G, which means "match at start of match position, taking
3774 into account the match offset".
3776 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
3777 because that will try the rest of the pattern at all possible matching points,
3778 so there is no point trying again.... er ....
3780 .... except when the .* appears inside capturing parentheses, and there is a
3781 subsequent back reference to those parentheses. We haven't enough information
3782 to catch that case precisely.
3784 At first, the best we could do was to detect when .* was in capturing brackets
3785 and the highest back reference was greater than or equal to that level.
3786 However, by keeping a bitmap of the first 31 back references, we can catch some
3787 of the more common cases more precisely.
3790 code points to start of expression (the bracket)
3791 options points to the options setting
3792 bracket_map a bitmap of which brackets we are inside while testing; this
3793 handles up to substring 31; after that we just have to take
3794 the less precise approach
3795 backref_map the back reference bitmap
3797 Returns: TRUE or FALSE
/* is_anchored: examine each alternative of the bracket at "code" and report
whether all of them are anchored. Nested brackets are handled by recursion;
bracket_map gains one bit per enclosing capturing bracket so that it can be
tested against backref_map when a leading .* is met.
NOTE(review): several structural lines of this function (braces, the loop
opener, the capturing-bracket test and some return statements) are not present
in this copy of the file; the notes below describe only the lines shown. */
3801 is_anchored(register const uschar *code, int *options, unsigned int bracket_map,
3802 unsigned int backref_map)
3805 const uschar *scode =
3806 first_significant_code(code + 1+LINK_SIZE, options, PCRE_MULTILINE, FALSE);
3807 register int op = *scode;
/* op is the opcode found at the position first_significant_code() returned
for this alternative, the scan having started at code + 1+LINK_SIZE. */
3809 /* Capturing brackets */
/* A bracket number above EXTRACT_BASIC_MAX is replaced by the value that GET2
reads at offset 2+LINK_SIZE of the bracket item. */
3815 if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE);
/* Numbers below 32 get their own bit in the map; anything larger shares bit 0.
NOTE(review): when op is 31, "1 << op" shifts a signed int into its sign bit,
which is undefined where int is 32 bits wide; "1u << op" would avoid that.
Confirm before changing. */
3816 new_map = bracket_map | ((op < 32)? (1 << op) : 1);
/* Recurse with the enlarged map; a FALSE result from the nested group makes
this call return FALSE as well. */
3817 if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
3820 /* Other brackets */
3822 else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
/* Same recursion, but these bracket types add no bit to the map. */
3824 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
3827 /* .* is not anchored unless DOTALL is set and it isn't in brackets that
3828 are or may be referenced. */
3830 else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR) &&
3831 (*options & PCRE_DOTALL) != 0)
/* scode[1] is the item the star applies to: it must be OP_ANY, and no bracket
recorded in bracket_map may also be set in backref_map. */
3833 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
3836 /* Check for explicit anchoring */
/* The condition below holds when op is none of OP_SOD, OP_SOM or OP_CIRC, and
also when op is OP_CIRC while PCRE_MULTILINE is set. */
3838 else if (op != OP_SOD && op != OP_SOM &&
3839 ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
/* Advance by the link value stored at offset 1 of the current item. */
3841 code += GET(code, 1);
3843 while (*code == OP_ALT); /* Loop for each alternative */
3849 /*************************************************
3850 * Check for starting with ^ or .* *
3851 *************************************************/
3853 /* This is called to find out if every branch starts with ^ or .* so that
3854 "first char" processing can be done to speed things up in multiline
3855 matching and for non-DOTALL patterns that start with .* (which must start at
3856 the beginning or after \n). As in the case of is_anchored() (see above), we
3857 have to take account of back references to capturing brackets that contain .*
3858 because in that case we can't make the assumption.
3861 code points to start of expression (the bracket)
3862 bracket_map a bitmap of which brackets we are inside while testing; this
3863 handles up to substring 31; after that we just have to take
3864 the less precise approach
3865 backref_map the back reference bitmap
3867 Returns: TRUE or FALSE
/* is_startline: examine each alternative of the bracket at "code" and return
FALSE as soon as one of them begins with something other than a circumflex, a
starred OP_ANY that is safe to treat as a line start, or a bracket whose own
alternatives all pass the same test (checked by recursion).
NOTE(review): several structural lines of this function (braces, the loop
opener, the capturing-bracket test and the final return) are not present in
this copy of the file; the notes below describe only the lines shown. */
3871 is_startline(const uschar *code, unsigned int bracket_map,
3872 unsigned int backref_map)
3875 const uschar *scode = first_significant_code(code + 1+LINK_SIZE, NULL, 0,
3877 register int op = *scode;
3879 /* Capturing brackets */
/* A bracket number above EXTRACT_BASIC_MAX is replaced by the value that GET2
reads at offset 2+LINK_SIZE of the bracket item. */
3885 if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE);
/* Numbers below 32 get their own bit in the map; anything larger shares bit 0.
NOTE(review): when op is 31, "1 << op" shifts a signed int into its sign bit,
which is undefined where int is 32 bits wide; "1u << op" would avoid that.
Confirm before changing. */
3886 new_map = bracket_map | ((op < 32)? (1 << op) : 1);
3887 if (!is_startline(scode, new_map, backref_map)) return FALSE;
3890 /* Other brackets */
3892 else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
/* Same recursion, but these bracket types add no bit to the map. */
3893 { if (!is_startline(scode, bracket_map, backref_map)) return FALSE; }
3895 /* .* means "start at start or after \n" if it isn't in brackets that
3896 may be referenced. */
/* No option bits are consulted for this test: the star must apply to OP_ANY
(scode[1]) and no bracket recorded in bracket_map may be set in backref_map. */
3898 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
3900 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
3903 /* Check for explicit circumflex */
3905 else if (op != OP_CIRC) return FALSE;
3907 /* Move on to the next alternative */
/* The distance to step is the link value stored at offset 1 of the current
item. */
3909 code += GET(code, 1);
3911 while (*code == OP_ALT); /* Loop for each alternative */
3917 /*************************************************
3918 * Check for asserted fixed first char *
3919 *************************************************/
3921 /* During compilation, the "first char" settings from forward assertions are
3922 discarded, because they can cause conflicts with actual literals that follow.
3923 However, if we end up without a first char setting for an unanchored pattern,
3924 it is worth scanning the regex to see if there is an initial asserted first
3925 char. If all branches start with the same asserted char, or with a bracket all
3926 of whose alternatives start with the same asserted char (recurse ad lib), then
3927 we return that char, otherwise -1.
3930 code points to start of expression (the bracket)
3931 options pointer to the options (used to check casing changes)
3932 inassert TRUE if in an assertion
3934 Returns: -1 or the fixed first char
/* find_firstassertedchar: walk the alternatives of the bracket at "code",
recursing into nested groups, and look for one character value that every
alternative agrees on; -1 is returned as soon as two alternatives disagree.
NOTE(review): the switch skeleton, most case labels, the braces and the final
return are not present in this copy of the file; the notes below describe only
the lines shown. */
3938 find_firstassertedchar(const uschar *code, int *options, BOOL inassert)
/* c is the candidate value collected so far; it stays negative until a first
value has been recorded. */
3940 register int c = -1;
3943 const uschar *scode =
3944 first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);
3945 register int op = *scode;
/* Every opcode at or above OP_BRA is treated as plain OP_BRA from here on. */
3947 if (op >= OP_BRA) op = OP_BRA;
/* Recurse into the nested group; the inassert argument of the recursive call
is TRUE only when this group's opcode is OP_ASSERT. */
3958 if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)
/* The first value found is adopted; any later value must be identical to it. */
3960 if (c < 0) c = d; else if (c != d) return -1;
3963 case OP_EXACT: /* Fall through */
/* A value met while inassert is FALSE is rejected outright. */
3970 if (!inassert) return -1;
/* REQ_CASELESS is or-ed into the recorded value when PCRE_CASELESS is set in
the current options. */
3974 if ((*options & PCRE_CASELESS) != 0) c |= REQ_CASELESS;
/* Otherwise the byte at scode[1] must equal the value already recorded. */
3976 else if (c != scode[1]) return -1;
/* Advance by the link value stored at offset 1 of the current item and repeat
while another alternative follows. */
3980 code += GET(code, 1);
3982 while (*code == OP_ALT);
3988 /*************************************************
3989 * Compile a Regular Expression *
3990 *************************************************/
3992 /* This function takes a string and returns a pointer to a block of store
3993 holding a compiled version of the expression. The original API for this
3994 function had no error code return variable; it is retained for backwards
3995 compatibility. The new function is given a new name.
3998 pattern the regular expression
3999 options various option bits
4000 errorcodeptr pointer to error code variable (pcre_compile2() only)
4001 can be NULL if you don't want a code value
4002 errorptr pointer to pointer to error text
4003 erroroffset ptr offset in pattern where error was detected
4004 tables pointer to character tables or NULL
4006 Returns: pointer to compiled data block, or NULL on error,
4007 with errorptr and erroroffset set
/* pcre_compile: original entry point without an error-code argument. It passes
its five arguments straight through to pcre_compile2() and supplies NULL for
errorcodeptr, so no numeric error code is reported through this interface; the
return value is whatever pcre_compile2() returns. */
4010 PCRE_DATA_SCOPE pcre *
4011 pcre_compile(const char *pattern, int options, const char **errorptr,
4012 int *erroroffset, const unsigned char *tables)
4014 return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
4019 PCRE_DATA_SCOPE pcre *
4020 pcre_compile2(const char *pattern, int options, int *errorcodeptr,
4021 const char **errorptr, int *erroroffset, const unsigned char *tables)
4024 int length = 1 + LINK_SIZE; /* For initial BRA plus length */
4025 int c, firstbyte, reqbyte, newline;
4027 int branch_extra = 0;
4028 int branch_newextra;
4029 int item_count = -1;
4031 int max_name_size = 0;
4032 int lastitemlength = 0;
4038 BOOL inescq = FALSE;
4040 unsigned int brastackptr = 0;
4043 const uschar *codestart;
4045 compile_data compile_block;
4046 compile_data *cd = &compile_block;
4047 int brastack[BRASTACK_SIZE];
4048 uschar bralenstack[BRASTACK_SIZE];
4050 /* We can't pass back an error message if errorptr is NULL; I guess the best we
4051 can do is just return NULL, but we can set a code value if there is a code
4054 if (errorptr == NULL)
4056 if (errorcodeptr != NULL) *errorcodeptr = 99;
4061 if (errorcodeptr != NULL) *errorcodeptr = ERR0;
4063 /* However, we can give a message for this error */
4065 if (erroroffset == NULL)
4068 goto PCRE_EARLY_ERROR_RETURN;
4073 /* Can't support UTF8 unless PCRE has been compiled to include the code. */
4076 utf8 = (options & PCRE_UTF8) != 0;
4077 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
4078 (*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0)
4081 goto PCRE_EARLY_ERROR_RETURN;
4084 if ((options & PCRE_UTF8) != 0)
4087 goto PCRE_EARLY_ERROR_RETURN;
4091 if ((options & ~PUBLIC_OPTIONS) != 0)
4094 goto PCRE_EARLY_ERROR_RETURN;
4097 /* Set up pointers to the individual character tables */
4099 if (tables == NULL) tables = _pcre_default_tables;
4100 cd->lcc = tables + lcc_offset;
4101 cd->fcc = tables + fcc_offset;
4102 cd->cbits = tables + cbits_offset;
4103 cd->ctypes = tables + ctypes_offset;
4105 /* Handle different types of newline. The two bits give four cases. The current
4106 code allows for one- or two-byte sequences. */
4108 switch (options & PCRE_NEWLINE_CRLF)
4110 default: newline = NEWLINE; break; /* Compile-time default */
4111 case PCRE_NEWLINE_CR: newline = '\r'; break;
4112 case PCRE_NEWLINE_LF: newline = '\n'; break;
4113 case PCRE_NEWLINE_CR+
4114 PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
4120 cd->nl[0] = (newline >> 8) & 255;
4121 cd->nl[1] = newline & 255;
4126 cd->nl[0] = newline;
4129 /* Maximum back reference and backref bitmap. This is updated for numeric
4130 references during the first pass, but for named references during the actual
4131 compile pass. The bitmap records up to 31 back references to help in deciding
4132 whether (.*) can be treated as anchored or not. */
4134 cd->top_backref = 0;
4135 cd->backref_map = 0;
4137 /* Reflect pattern for debugging output */
4139 DPRINTF(("------------------------------------------------------------------\n"));
4140 DPRINTF(("%s\n", pattern));
4142 /* The first thing to do is to make a pass over the pattern to compute the
4143 amount of store required to hold the compiled code. This does not have to be
4144 perfect as long as errors are overestimates. At the same time we can detect any
4145 flag settings right at the start, and extract them. Make an attempt to correct
4146 for any counted white space if an "extended" flag setting appears late in the
4147 pattern. We can't be so clever for #-comments. */
4149 ptr = (const uschar *)(pattern - 1);
4150 while ((c = *(++ptr)) != 0)
4157 /* If we are inside a \Q...\E sequence, all chars are literal */
4161 if ((options & PCRE_AUTO_CALLOUT) != 0) length += 2 + 2*LINK_SIZE;
4165 /* Otherwise, first check for ignored whitespace and comments */
4167 if ((options & PCRE_EXTENDED) != 0)
4169 if ((cd->ctypes[c] & ctype_space) != 0) continue;
4172 while (*(++ptr) != 0) if (IS_NEWLINE(ptr)) break;
4175 ptr += cd->nllen - 1;
4178 break; /* End loop at end of pattern */
4182 item_count++; /* Is zero for the first non-comment item */
4184 /* Allow space for auto callout before every item except quantifiers. */
4186 if ((options & PCRE_AUTO_CALLOUT) != 0 &&
4187 c != '*' && c != '+' && c != '?' &&
4188 (c != '{' || !is_counted_repeat(ptr + 1)))
4189 length += 2 + 2*LINK_SIZE;
4193 /* A backslashed item may be an escaped data character or it may be a
4197 c = check_escape(&ptr, &errorcode, bracount, options, FALSE);
4198 if (errorcode != 0) goto PCRE_ERROR_RETURN;
4200 lastitemlength = 1; /* Default length of last item for repeats */
4202 if (c >= 0) /* Data character */
4204 length += 2; /* For a one-byte character */
4207 if (utf8 && c > 127)
4210 for (i = 0; i < _pcre_utf8_table1_size; i++)
4211 if (c <= _pcre_utf8_table1[i]) break;
4213 lastitemlength += i;
4220 /* If \Q, enter "literal" mode */
4228 /* \X is supported only if Unicode property support is compiled */
4234 goto PCRE_ERROR_RETURN;
4238 /* \P and \p are for Unicode properties, but only when the support has
4239 been compiled. Each item needs 3 bytes. */
4241 else if (-c == ESC_P || -c == ESC_p)
4248 if (get_ucp(&ptr, &negated, &pdata, &errorcode) < 0)
4249 goto PCRE_ERROR_RETURN;
4253 goto PCRE_ERROR_RETURN;
4257 /* Other escapes need one byte */
4261 /* A back reference needs an additional 2 bytes, plus either one or 5
4262 bytes for a repeat. We also need to keep the value of the highest
4267 int refnum = -c - ESC_REF;
4268 cd->backref_map |= (refnum < 32)? (1 << refnum) : 1;
4269 if (refnum > cd->top_backref)
4270 cd->top_backref = refnum;
4271 length += 2; /* For single back reference */
4272 if (ptr[1] == '{' && is_counted_repeat(ptr+2))
4274 ptr = read_repeat_counts(ptr+2, &min, &max, &errorcode);
4275 if (errorcode != 0) goto PCRE_ERROR_RETURN;
4276 if ((min == 0 && (max == 1 || max == -1)) ||
4277 (min == 1 && max == -1))
4280 if (ptr[1] == '?') ptr++;
4285 case '^': /* Single-byte metacharacters */
4292 case '*': /* These repeats won't be after brackets; */
4293 case '+': /* those are handled separately */
4296 goto POSESSIVE; /* A few lines below */
4298 /* This covers the cases of braced repeats after a single char, metachar,
4299 class, or back reference. */
4302 if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;
4303 ptr = read_repeat_counts(ptr+1, &min, &max, &errorcode);
4304 if (errorcode != 0) goto PCRE_ERROR_RETURN;
4306 /* These special cases just insert one extra opcode */
4308 if ((min == 0 && (max == 1 || max == -1)) ||
4309 (min == 1 && max == -1))
4312 /* These cases might insert additional copies of a preceding character. */
4318 length -= lastitemlength; /* Uncount the original char or metachar */
4319 if (min > 0) length += 3 + lastitemlength;
4321 length += lastitemlength + ((max > 0)? 3 : 1);
4324 if (ptr[1] == '?') ptr++; /* Needs no extra length */
4326 POSESSIVE: /* Test for possessive quantifier */
4330 length += 2 + 2*LINK_SIZE; /* Allow for atomic brackets */
4334 /* An alternation contains an offset to the next branch or ket. If any ims
4335 options changed in the previous branch(es), and/or if we are in a
4336 lookbehind assertion, extra space will be needed at the start of the
4337 branch. This is handled by branch_extra. */
4340 length += 1 + LINK_SIZE + branch_extra;
4343 /* A character class uses 33 characters provided that all the character
4344 values are less than 256. Otherwise, it uses a bit map for low valued
4345 characters, and individual items for others. Don't worry about character
4346 types that aren't allowed in classes - they'll get picked up during the
4347 compile. A character class that contains only one single-byte character
4348 uses 2 or 3 bytes, depending on whether it is negated or not. Notice this
4349 where we can. (In UTF-8 mode we can do this only for chars < 128.) */
4352 if (*(++ptr) == '^')
4354 class_optcount = 10; /* Greater than one */
4357 else class_optcount = 0;
4363 /* Written as a "do" so that an initial ']' is taken as data */
4367 /* Inside \Q...\E everything is literal except \E */
4371 if (*ptr != '\\' || ptr[1] != 'E') goto GET_ONE_CHARACTER;
4377 /* Outside \Q...\E, check for escapes */
4381 c = check_escape(&ptr, &errorcode, bracount, options, TRUE);
4382 if (errorcode != 0) goto PCRE_ERROR_RETURN;
4384 /* \b is backspace inside a class; \X is literal */
4386 if (-c == ESC_b) c = '\b';
4387 else if (-c == ESC_X) c = 'X';
4389 /* \Q enters quoting mode */
4391 else if (-c == ESC_Q)
4397 /* Handle escapes that turn into characters */
4399 if (c >= 0) goto NON_SPECIAL_CHARACTER;
4401 /* Escapes that are meta-things. The normal ones just affect the
4402 bit map, but Unicode properties require an XCLASS extended item. */
4406 class_optcount = 10; /* \d, \s etc; make sure > 1 */
4408 if (-c == ESC_p || -c == ESC_P)
4413 length += LINK_SIZE + 2;
4421 /* Check the syntax for POSIX stuff. The bits we actually handle are
4422 checked during the real compile phase. */
4424 else if (*ptr == '[' &&
4425 (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
4426 check_posix_syntax(ptr, &ptr, cd))
4429 class_optcount = 10; /* Make sure > 1 */
4432 /* Anything else increments the possible optimization count. We have to
4433 detect ranges here so that we can compute the number of extra ranges for
4434 caseless wide characters when UCP support is available. If there are wide
4435 characters, we are going to have to use an XCLASS, even for single
4448 GETCHARLEN(c, ptr, extra);
4456 /* Come here from handling \ above when it escapes to a char value */
4458 NON_SPECIAL_CHARACTER:
4464 uschar const *hyptr = ptr++;
4468 d = check_escape(&ptr, &errorcode, bracount, options, TRUE);
4469 if (errorcode != 0) goto PCRE_ERROR_RETURN;
4470 if (-d == ESC_b) d = '\b'; /* backspace */
4471 else if (-d == ESC_X) d = 'X'; /* literal X in a class */
4473 else if (ptr[1] != 0 && ptr[1] != ']')
4480 GETCHARLEN(d, ptr, extra);
4487 if (d < 0) ptr = hyptr; /* go back to hyphen as data */
4490 /* If d >= 0 we have a range. In UTF-8 mode, if the end is > 255, or >
4491 127 for caseless matching, we will need to use an XCLASS. */
4495 class_optcount = 10; /* Ensure > 1 */
4499 goto PCRE_ERROR_RETURN;
4503 if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
4506 if (!class_utf8) /* Allow for XCLASS overhead */
4509 length += LINK_SIZE + 2;
4513 /* If we have UCP support, find out how many extra ranges are
4514 needed to map the other case of characters within this range. We
4515 have to mimic the range optimization here, because extending the
4516 range upwards might push d over a boundary that makes it use
4517 another byte in the UTF-8 representation. */
4519 if ((options & PCRE_CASELESS) != 0)
4524 while (get_othercase_range(&cc, origd, &occ, &ocd))
4526 if (occ >= c && ocd <= d) continue; /* Skip embedded */
4528 if (occ < c && ocd >= c - 1) /* Extend the basic range */
4529 { /* if there is overlap, */
4530 c = occ; /* noting that if occ < c */
4531 continue; /* we can't have ocd > d */
4532 } /* because a subrange is */
4533 if (ocd > d && occ <= d + 1) /* always shorter than */
4534 { /* the basic range. */
4539 /* An extra item is needed */
4541 length += 1 + _pcre_ord2utf8(occ, buffer) +
4542 ((occ == ocd)? 0 : _pcre_ord2utf8(ocd, buffer));
4545 #endif /* SUPPORT_UCP */
4547 /* The length of the (possibly extended) range */
4549 length += 1 + _pcre_ord2utf8(c, buffer) + _pcre_ord2utf8(d, buffer);
4551 #endif /* SUPPORT_UTF8 */
4555 /* We have a single character. There is nothing to be done unless we
4556 are in UTF-8 mode. If the char is > 255, or 127 when caseless, we must
4557 allow for an XCL_SINGLE item, doubled for caselessness if there is UCP
4563 if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
4566 class_optcount = 10; /* Ensure > 1 */
4567 if (!class_utf8) /* Allow for XCLASS overhead */
4570 length += LINK_SIZE + 2;
4573 length += (((options & PCRE_CASELESS) != 0)? 2 : 1) *
4574 (1 + _pcre_ord2utf8(c, buffer));
4575 #else /* SUPPORT_UCP */
4576 length += 1 + _pcre_ord2utf8(c, buffer);
4577 #endif /* SUPPORT_UCP */
4579 #endif /* SUPPORT_UTF8 */
4583 while (*(++ptr) != 0 && (inescq || *ptr != ']')); /* Concludes "do" above */
4585 if (*ptr == 0) /* Missing terminating ']' */
4588 goto PCRE_ERROR_RETURN;
4591 /* We can optimize when there was only one optimizable character. Repeats
4592 for positive and negated single one-byte chars are handled by the general
4593 code. Here, we handle repeats for the class opcodes. */
4595 if (class_optcount == 1) length += 3; else
4599 /* A repeat needs either 1 or 5 bytes. If it is a possessive quantifier,
4600 we also need extra for wrapping the whole thing in a sub-pattern. */
4602 if (*ptr != 0 && ptr[1] == '{' && is_counted_repeat(ptr+2))
4604 ptr = read_repeat_counts(ptr+2, &min, &max, &errorcode);
4605 if (errorcode != 0) goto PCRE_ERROR_RETURN;
4606 if ((min == 0 && (max == 1 || max == -1)) ||
4607 (min == 1 && max == -1))
4613 length += 2 + 2*LINK_SIZE;
4615 else if (ptr[1] == '?') ptr++;
4620 /* Brackets may be genuine groups or special things */
4623 branch_newextra = 0;
4624 bracket_length = 1 + LINK_SIZE;
4627 /* Handle special forms of bracket, which all start (? */
4636 /* Skip over comments entirely */
4639 while (*ptr != 0 && *ptr != ')') ptr++;
4643 goto PCRE_ERROR_RETURN;
4647 /* Non-referencing groups and lookaheads just move the pointer on, and
4648 then behave like a non-special bracket, except that they don't increment
4649 the count of extracting brackets. Ditto for the "once only" bracket,
4650 which is in Perl from version 5.005. */
4659 /* Named subpatterns are an extension copied from Python */
4664 /* Handle the definition of a named subpattern */
4668 const uschar *p; /* Don't amalgamate; some compilers */
4669 p = ++ptr; /* grumble at autoincrement in declaration */
4670 while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4674 goto PCRE_ERROR_RETURN;
4677 if (name_count > MAX_NAME_COUNT)
4680 goto PCRE_ERROR_RETURN;
4682 if (ptr - p > max_name_size)
4684 max_name_size = (ptr - p);
4685 if (max_name_size > MAX_NAME_SIZE)
4688 goto PCRE_ERROR_RETURN;
4691 capturing = TRUE; /* Named parentheses are always capturing */
4692 break; /* Go handle capturing parentheses */
4695 /* Handle back references and recursive calls to named subpatterns */
4697 if (*ptr == '=' || *ptr == '>')
4699 length += 3 + 3*LINK_SIZE; /* Allow for the automatic "once" */
4700 while ((cd->ctypes[*(++ptr)] & ctype_word) != 0);
4704 goto PCRE_ERROR_RETURN;
4706 goto RECURSE_CHECK_QUANTIFIED;
4709 /* Unknown character after (?P */
4712 goto PCRE_ERROR_RETURN;
4714 /* (?R) specifies a recursive call to the regex, which is an extension
4715 to provide the facility which can be obtained by (?p{perl-code}) in
4716 Perl 5.6. In Perl 5.8 this has become (??{perl-code}).
4718 From PCRE 4.00, items such as (?3) specify subroutine-like "calls" to
4719 the appropriate numbered brackets. This includes both recursive and
4720 non-recursive calls. (?R) is now synonymous with (?0). */
4725 case '0': case '1': case '2': case '3': case '4':
4726 case '5': case '6': case '7': case '8': case '9':
4729 while ((digitab[*(++ptr)] & ctype_digit) != 0);
4733 goto PCRE_ERROR_RETURN;
4735 length += 3 + 3*LINK_SIZE; /* Allows for the automatic "once" */
4737 /* If this item is quantified, it will get wrapped inside brackets so
4738 as to use the code for quantified brackets. We jump down and use the
4739 code that handles this for real brackets. Come here from code for
4740 named recursions/subroutines. */
4742 RECURSE_CHECK_QUANTIFIED:
4743 if (ptr[1] == '+' || ptr[1] == '*' || ptr[1] == '?' || ptr[1] == '{')
4745 length += 2 + 2 * LINK_SIZE; /* to make bracketed */
4746 duplength = 5 + 3 * LINK_SIZE;
4747 goto HANDLE_QUANTIFIED_BRACKETS;
4751 /* (?C) is an extension which provides "callout" - to provide a bit of
4752 the functionality of the Perl (?{...}) feature. An optional number may
4753 follow (default is zero). */
4757 while ((digitab[*(++ptr)] & ctype_digit) != 0);
4761 goto PCRE_ERROR_RETURN;
4763 length += 2 + 2*LINK_SIZE;
4766 /* Lookbehinds are in Perl from version 5.005 */
4770 if (*ptr == '=' || *ptr == '!')
4772 branch_newextra = 1 + LINK_SIZE;
4773 length += 1 + LINK_SIZE; /* For the first branch */
4777 goto PCRE_ERROR_RETURN;
4779 /* Conditionals are in Perl from version 5.005. The bracket must either
4780 be followed by a number (for bracket reference) or by an assertion
4781 group. PCRE extends this by allowing a name to reference a named group;
4782 unfortunately, previously 'R' was implemented for a recursion test.
4783 When this is compiled, we look for the named group 'R' first. At this
4784 point we just do a basic syntax check. */
4787 if ((cd->ctypes[ptr[3]] & ctype_word) != 0)
4791 while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4795 goto PCRE_ERROR_RETURN;
4798 else /* An assertion must follow */
4800 ptr++; /* Can treat like ':' as far as spacing is concerned */
4801 if (ptr[2] != '?' ||
4802 (ptr[3] != '=' && ptr[3] != '!' && ptr[3] != '<') )
4804 ptr += 2; /* To get right offset in message */
4806 goto PCRE_ERROR_RETURN;
4811 /* Else loop checking valid options until ) is met. Anything else is an
4812 error. If we are without any brackets, i.e. at top level, the settings
4813 act as if specified in the options, so massage the options immediately.
4814 This is for backward compatibility with Perl 5.004. */
4827 *optset |= PCRE_CASELESS;
4831 *optset |= PCRE_DUPNAMES;
4832 options |= PCRE_JCHANGED; /* Record that it changed */
4836 *optset |= PCRE_MULTILINE;
4840 *optset |= PCRE_DOTALL;
4844 *optset |= PCRE_EXTENDED;
4848 *optset |= PCRE_EXTRA;
4852 *optset |= PCRE_UNGREEDY;
4859 /* A termination by ')' indicates an options-setting-only item; if
4860 this is at the very start of the pattern (indicated by item_count
4861 being zero), we use it to set the global options. This is helpful
4862 when analyzing the pattern for first characters, etc. Otherwise
4863 nothing is done here and it is handled during the compiling
4866 We allow for more than one options setting at the start. If such
4867 settings do not change the existing options, nothing is compiled.
4868 However, we must leave space just in case something is compiled.
4869 This can happen for pathological sequences such as (?i)(?-i)
4870 because the global options will end up with -i set. The space is
4871 small and not significant. (Before I did this there was a reported
4872 bug with (?i)(?-i) in a machine-generated pattern.)
4874 [Historical note: Up to Perl 5.8, options settings at top level
4875 were always global settings, wherever they appeared in the pattern.
4876 That is, they were equivalent to an external setting. From 5.8
4877 onwards, they apply only to what follows (which is what you might
4881 if (item_count == 0)
4883 options = (options | set) & (~unset);
4884 set = unset = 0; /* To save length */
4885 item_count--; /* To allow for several */
4891 /* A termination by ':' indicates the start of a nested group with
4892 the given options set. This is again handled at compile time, but
4893 we must allow for compiled space if any of the ims options are
4894 set. We also have to allow for resetting space at the end of
4895 the group, which is why 4 is added to the length and not just 2.
4896 If there are several changes of options within the same group, this
4897 will lead to an over-estimate on the length, but this shouldn't
4898 matter very much. We also have to allow for resetting options at
4899 the start of any alternations, which we do by setting
4900 branch_newextra to 2. */
4903 if (((set|unset) & PCRE_IMS) != 0)
4906 branch_newextra = 2;
4910 /* Unrecognized option character */
4914 goto PCRE_ERROR_RETURN;
4918 /* If we hit a closing bracket, that's it - this is a freestanding
4919 option-setting. We need to ensure that branch_extra is updated if
4920 necessary. The only values branch_newextra can have here are 0 or 2.
4921 If the value is 2, then branch_extra must either be 2 or 5, depending
4922 on whether this is a lookbehind group or not. */
4927 if (branch_newextra == 2 &&
4928 (branch_extra == 0 || branch_extra == 1+LINK_SIZE))
4929 branch_extra += branch_newextra;
4933 /* If options were terminated by ':' control comes here. This is a
4934 non-capturing group with an options change. There is nothing more that
4935 needs to be done because "capturing" is already set FALSE by default;
4936 we can just fall through. */
4941 /* Ordinary parentheses, not followed by '?', are capturing unless
4942 PCRE_NO_AUTO_CAPTURE is set. */
4944 else capturing = (options & PCRE_NO_AUTO_CAPTURE) == 0;
4946 /* Capturing brackets must be counted so we can process escapes in a
4947 Perlish way. If the number exceeds EXTRACT_BASIC_MAX we are going to need
4948 an additional 3 bytes of memory per capturing bracket. */
4953 if (bracount > EXTRACT_BASIC_MAX) bracket_length += 3;
4956 /* Save length for computing whole length at end if there's a repeat that
4957 requires duplication of the group. Also save the current value of
4958 branch_extra, and start the new group with the new value. If non-zero, this
4959 will either be 2 for a (?imsx: group, or 1+LINK_SIZE for a lookbehind assertion. */
4961 if (brastackptr >= sizeof(brastack)/sizeof(int))
4964 goto PCRE_ERROR_RETURN;
4967 bralenstack[brastackptr] = branch_extra;
4968 branch_extra = branch_newextra;
4970 brastack[brastackptr++] = length;
4971 length += bracket_length;
4974 /* Handle ket. Look for subsequent max/min; for certain sets of values we
4975 have to replicate this bracket up to that many times. If brastackptr is
4976 0 this is an unmatched bracket which will generate an error, but take care
4977 not to try to access brastack[-1] when computing the length and restoring
4978 the branch_extra value. */
4981 length += 1 + LINK_SIZE;
4982 if (brastackptr > 0)
4984 duplength = length - brastack[--brastackptr];
4985 branch_extra = bralenstack[brastackptr];
4986 /* This is a paranoid check to stop integer overflow later on */
4987 if (duplength > MAX_DUPLENGTH)
4990 goto PCRE_ERROR_RETURN;
4995 /* The following code is also used when a recursion such as (?3) is
4996 followed by a quantifier, because in that case, it has to be wrapped inside
4997 brackets so that the quantifier works. The value of duplength must be
4998 set before arrival. */
5000 HANDLE_QUANTIFIED_BRACKETS:
5002 /* Leave ptr at the final char; for read_repeat_counts this happens
5003 automatically; for the others we need an increment. */
5005 if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2))
5007 ptr = read_repeat_counts(ptr+2, &min, &max, &errorcode);
5008 if (errorcode != 0) goto PCRE_ERROR_RETURN;
5010 else if (c == '*') { min = 0; max = -1; ptr++; }
5011 else if (c == '+') { min = 1; max = -1; ptr++; }
5012 else if (c == '?') { min = 0; max = 1; ptr++; }
5013 else { min = 1; max = 1; }
5015 /* If the minimum is zero, we have to allow for an OP_BRAZERO before the
5016 group, and if the maximum is greater than zero, we have to replicate
5017 maxval-1 times; each replication acquires an OP_BRAZERO plus a nesting
5023 if (max > 0) length += (max - 1) * (duplength + 3 + 2*LINK_SIZE);
5026 /* When the minimum is greater than zero, we have to replicate up to
5027 minval-1 times, with no additions required in the copies. Then, if there
5028 is a limited maximum we have to replicate up to maxval-1 times allowing
5029 for a BRAZERO item before each optional copy and nesting brackets for all
5030 but one of the optional copies. */
5034 length += (min - 1) * duplength;
5035 if (max > min) /* Need this test as max=-1 means no limit */
5036 length += (max - min) * (duplength + 3 + 2*LINK_SIZE)
5037 - (2 + 2*LINK_SIZE);
5040 /* Allow space for once brackets for "possessive quantifier" */
5045 length += 2 + 2*LINK_SIZE;
5049 /* Non-special character. It won't be space or # in extended mode, so it is
5050 always a genuine character. If we are in a \Q...\E sequence, check for the
5051 end; if not, we have a literal. */
5056 if (inescq && c == '\\' && ptr[1] == 'E')
5063 length += 2; /* For a one-byte character */
5064 lastitemlength = 1; /* Default length of last item for repeats */
5066 /* In UTF-8 mode, check for additional bytes. */
5069 if (utf8 && (c & 0xc0) == 0xc0)
5071 while ((ptr[1] & 0xc0) == 0x80) /* Can't flow over the end */
5072 { /* because the end is marked */
5073 lastitemlength++; /* by a zero byte. */
5084 length += 2 + LINK_SIZE; /* For final KET and END */
5086 if ((options & PCRE_AUTO_CALLOUT) != 0)
5087 length += 2 + 2*LINK_SIZE; /* For final callout */
5089 if (length > MAX_PATTERN_SIZE)
5092 goto PCRE_EARLY_ERROR_RETURN;
5095 /* Compute the size of the data block needed and get it, either from malloc or
5096 from an externally provided function. Integer overflow should no longer be possible
5097 because nowadays we limit the maximum values of name_count and max_name_size. */
5099 size = length + sizeof(real_pcre) + name_count * (max_name_size + 3);
5100 re = (real_pcre *)(pcre_malloc)(size);
5105 goto PCRE_EARLY_ERROR_RETURN;
5108 /* Put in the magic number, and save the sizes, options, and character table
5109 pointer. NULL is used for the default character tables. The nullpad field is at
5110 the end; it's there to help in the case when a regex compiled on a system with
5111 4-byte pointers is run on another with 8-byte pointers. */
5113 re->magic_number = MAGIC_NUMBER;
5115 re->options = options;
5117 re->name_table_offset = sizeof(real_pcre);
5118 re->name_entry_size = max_name_size + 3;
5119 re->name_count = name_count;
5121 re->tables = (tables == _pcre_default_tables)? NULL : tables;
5124 /* The starting points of the name/number translation table and of the code are
5125 passed around in the compile data block. */
5127 cd->names_found = 0;
5128 cd->name_entry_size = max_name_size + 3;
5129 cd->name_table = (uschar *)re + re->name_table_offset;
5130 codestart = cd->name_table + re->name_entry_size * re->name_count;
5131 cd->start_code = codestart;
5132 cd->start_pattern = (const uschar *)pattern;
5133 cd->req_varyopt = 0;
5134 cd->nopartial = FALSE;
5136 /* Set up a starting, non-extracting bracket, then compile the expression. On
5137 error, errorcode will be set non-zero, so we don't need to look at the result
5138 of the function here. */
5140 ptr = (const uschar *)pattern;
5141 code = (uschar *)codestart;
5144 (void)compile_regex(options, options & PCRE_IMS, &bracount, &code, &ptr,
5145 &errorcode, FALSE, 0, &firstbyte, &reqbyte, NULL, cd);
5146 re->top_bracket = bracount;
5147 re->top_backref = cd->top_backref;
5149 if (cd->nopartial) re->options |= PCRE_NOPARTIAL;
5151 /* If not reached end of pattern on success, there's an excess bracket. */
5153 if (errorcode == 0 && *ptr != 0) errorcode = ERR22;
5155 /* Fill in the terminating state and check for disastrous overflow, but
5156 if debugging, leave the test till after things are printed out. */
5161 if (code - codestart > length) errorcode = ERR23;
5164 /* Give an error if there's a back reference to a non-existent capturing
5167 if (re->top_backref > re->top_bracket) errorcode = ERR15;
5169 /* Failed to compile, or error while post-processing */
5175 *erroroffset = ptr - (const uschar *)pattern;
5176 PCRE_EARLY_ERROR_RETURN:
5177 *errorptr = error_texts[errorcode];
5178 if (errorcodeptr != NULL) *errorcodeptr = errorcode;
5182 /* If the anchored option was not passed, set the flag if we can determine that
5183 the pattern is anchored by virtue of ^ characters or \A or anything else (such
5184 as starting with .* when DOTALL is set).
5186 Otherwise, if we know what the first character has to be, save it, because that
5187 speeds up unanchored matches no end. If not, see if we can set the
5188 PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
5189 start with ^, and also when all branches start with .* for non-DOTALL matches.
5192 if ((options & PCRE_ANCHORED) == 0)
5194 int temp_options = options;
5195 if (is_anchored(codestart, &temp_options, 0, cd->backref_map))
5196 re->options |= PCRE_ANCHORED;
5200 firstbyte = find_firstassertedchar(codestart, &temp_options, FALSE);
5201 if (firstbyte >= 0) /* Remove caseless flag for non-caseable chars */
5203 int ch = firstbyte & 255;
5204 re->first_byte = ((firstbyte & REQ_CASELESS) != 0 &&
5205 cd->fcc[ch] == ch)? ch : firstbyte;
5206 re->options |= PCRE_FIRSTSET;
5208 else if (is_startline(codestart, 0, cd->backref_map))
5209 re->options |= PCRE_STARTLINE;
5213 /* For an anchored pattern, we use the "required byte" only if it follows a
5214 variable length item in the regex. Remove the caseless flag for non-caseable
5218 ((re->options & PCRE_ANCHORED) == 0 || (reqbyte & REQ_VARY) != 0))
5220 int ch = reqbyte & 255;
5221 re->req_byte = ((reqbyte & REQ_CASELESS) != 0 &&
5222 cd->fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte;
5223 re->options |= PCRE_REQCHSET;
5226 /* Print out the compiled data if debugging is enabled. This is never the
5227 case when building a production library. */
5231 printf("Length = %d top_bracket = %d top_backref = %d\n",
5232 length, re->top_bracket, re->top_backref);
5234 if (re->options != 0)
5236 printf("%s%s%s%s%s%s%s%s%s\n",
5237 ((re->options & PCRE_NOPARTIAL) != 0)? "nopartial " : "",
5238 ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
5239 ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
5240 ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
5241 ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
5242 ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
5243 ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
5244 ((re->options & PCRE_EXTRA) != 0)? "extra " : "",
5245 ((re->options & PCRE_UNGREEDY) != 0)? "ungreedy " : "");
5248 if ((re->options & PCRE_FIRSTSET) != 0)
5250 int ch = re->first_byte & 255;
5251 const char *caseless = ((re->first_byte & REQ_CASELESS) == 0)?
5253 if (isprint(ch)) printf("First char = %c%s\n", ch, caseless);
5254 else printf("First char = \\x%02x%s\n", ch, caseless);
5257 if ((re->options & PCRE_REQCHSET) != 0)
5259 int ch = re->req_byte & 255;
5260 const char *caseless = ((re->req_byte & REQ_CASELESS) == 0)?
5262 if (isprint(ch)) printf("Req char = %c%s\n", ch, caseless);
5263 else printf("Req char = \\x%02x%s\n", ch, caseless);
5266 pcre_printint(re, stdout);
5268 /* This check is done here in the debugging case so that the code that
5269 was compiled can be seen. */
5271 if (code - codestart > length)
5274 *errorptr = error_texts[ERR23];
5275 *erroroffset = ptr - (uschar *)pattern;
5276 if (errorcodeptr != NULL) *errorcodeptr = ERR23;
5284 /* End of pcre_compile.c */