1 /* $Cambridge: exim/src/src/pcre/pcre_compile.c,v 1.6 2007/11/12 13:02:19 nm4 Exp $ */
3 /*************************************************
4 * Perl-Compatible Regular Expressions *
5 *************************************************/
7 /* PCRE is a library of functions to support regular expressions whose syntax
8 and semantics are as close as possible to those of the Perl 5 language.
10 Written by Philip Hazel
11 Copyright (c) 1997-2007 University of Cambridge
13 -----------------------------------------------------------------------------
14 Redistribution and use in source and binary forms, with or without
15 modification, are permitted provided that the following conditions are met:
17 * Redistributions of source code must retain the above copyright notice,
18 this list of conditions and the following disclaimer.
20 * Redistributions in binary form must reproduce the above copyright
21 notice, this list of conditions and the following disclaimer in the
22 documentation and/or other materials provided with the distribution.
24 * Neither the name of the University of Cambridge nor the names of its
25 contributors may be used to endorse or promote products derived from
26 this software without specific prior written permission.
28 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
29 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
30 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
31 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
32 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
33 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
34 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
35 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
36 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
37 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38 POSSIBILITY OF SUCH DAMAGE.
39 -----------------------------------------------------------------------------
43 /* This module contains the external function pcre_compile(), along with
44 supporting internal functions that are not used by other modules. */
51 #define NLBLOCK cd /* Block containing newline information */
52 #define PSSTART start_pattern /* Field containing processed string start */
53 #define PSEND end_pattern /* Field containing processed string end */
55 #include "pcre_internal.h"
58 /* When DEBUG is defined, we need the pcre_printint() function, which is also
59 used by pcretest. DEBUG is not defined when building a production library. */
62 #include "pcre_printint.src"
/* Macro for setting individual bits in class bitmaps: sets bit number b in the
byte-array bitmap a (8 bits per byte, least significant bit first). Both
arguments and the whole expansion are parenthesized so that expression
arguments such as SETBIT(map, base + n) index and shift correctly. Note that b
is evaluated twice, so it must not have side effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)%8)))
70 /* Maximum length value to check against when making sure that the integer that
71 holds the compiled pattern length does not overflow. We make it a bit less than
72 INT_MAX to allow for adding in group terminating bytes, so that we don't have
73 to check them every time. */
75 #define OFLOW_MAX (INT_MAX - 20)
78 /*************************************************
79 * Code parameters and static tables *
80 *************************************************/
82 /* This value specifies the size of stack workspace that is used during the
83 first pre-compile phase that determines how much memory is required. The regex
84 is partly compiled into this space, but the compiled parts are discarded as
85 soon as they can be, so that hopefully there will never be an overrun. The code
86 does, however, check for an overrun. The largest amount I've seen used is 218,
87 so this number is very generous.
89 The same workspace is used during the second, actual compile phase for
90 remembering forward references to groups so that they can be filled in at the
91 end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
92 is 4 there is plenty of room. */
94 #define COMPILE_WORK_SIZE (4096)
97 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
98 are simple data values; negative values are for special things like \d and so
99 on. Zero means further processing is needed (for things like \x), or the escape is invalid. */
102 #ifndef EBCDIC /* This is the "normal" table for ASCII systems */
103 static const short int escapes[] = {
104 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
105 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
106 '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */
107 -ESC_H, 0, 0, -ESC_K, 0, 0, 0, 0, /* H - O */
108 -ESC_P, -ESC_Q, -ESC_R, -ESC_S, 0, 0, -ESC_V, -ESC_W, /* P - W */
109 -ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
110 '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */
111 -ESC_h, 0, 0, -ESC_k, 0, 0, ESC_n, 0, /* h - o */
112 -ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, -ESC_v, -ESC_w, /* p - w */
113 0, 0, -ESC_z /* x - z */
116 #else /* This is the "abnormal" table for EBCDIC systems */
117 static const short int escapes[] = {
118 /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
119 /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
120 /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
121 /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
122 /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
123 /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
124 /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
125 /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
126 /* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0,
127 /* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
128 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
129 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0,
130 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
131 /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
132 /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
133 /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
134 /* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0,
135 /* D0 */ '}', 0, -ESC_K, 0, 0, 0, 0, -ESC_P,
136 /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
137 /* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X,
138 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
139 /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
140 /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
145 /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
146 searched linearly. Put all the names into a single string, in order to reduce
147 the number of relocations when a shared library is dynamically linked. */
149 typedef struct verbitem {
154 static const char verbnames[] =
163 static verbitem verbs[] = {
173 static int verbcount = sizeof(verbs)/sizeof(verbitem);
176 /* Tables of names of POSIX character classes and their lengths. The names are
177 now all in a single string, to reduce the number of relocations when a shared
178 library is dynamically loaded. The list of lengths is terminated by a zero
179 length entry. The first three must be alpha, lower, upper, as this is assumed
180 for handling case independence. */
182 static const char posix_names[] =
183 "alpha\0" "lower\0" "upper\0" "alnum\0" "ascii\0" "blank\0"
184 "cntrl\0" "digit\0" "graph\0" "print\0" "punct\0" "space\0"
187 static const uschar posix_name_lengths[] = {
188 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
190 /* Table of class bit maps for each POSIX class. Each class is formed from a
191 base map, with an optional addition or removal of another map. Then, for some
192 classes, there is some additional tweaking: for [:blank:] the vertical space
193 characters are removed, and for [:alpha:] and [:alnum:] the underscore
194 character is removed. The triples in the table consist of the base map offset,
195 second map offset or -1 if no second map, and a non-negative value for map
196 addition or a negative value for map subtraction (if there are two maps). The
197 absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
198 remove vertical space characters, 2 => remove underscore. */
200 static const int posix_class_maps[] = {
201 cbit_word, cbit_digit, -2, /* alpha */
202 cbit_lower, -1, 0, /* lower */
203 cbit_upper, -1, 0, /* upper */
204 cbit_word, -1, 2, /* alnum - word without underscore */
205 cbit_print, cbit_cntrl, 0, /* ascii */
206 cbit_space, -1, 1, /* blank - a GNU extension */
207 cbit_cntrl, -1, 0, /* cntrl */
208 cbit_digit, -1, 0, /* digit */
209 cbit_graph, -1, 0, /* graph */
210 cbit_print, -1, 0, /* print */
211 cbit_punct, -1, 0, /* punct */
212 cbit_space, -1, 0, /* space */
213 cbit_word, -1, 0, /* word - a Perl extension */
214 cbit_xdigit,-1, 0 /* xdigit */
/* Two-level stringification helpers. STRING() turns its argument into a string
literal exactly as written; XSTRING() lets its argument be macro-expanded first
and then stringifies the result, so that XSTRING(MAX_NAME_SIZE) produces the
text of the macro's value rather than the text "MAX_NAME_SIZE". */
218 #define STRING(a) # a
219 #define XSTRING(s) STRING(s)
221 /* The texts of compile-time error messages. These are "char *" because they
222 are passed to the outside world. Do not ever re-use any error number, because
223 they are documented. Always add a new error instead. Messages marked DEAD below
224 are no longer used. This used to be a table of strings, but in order to reduce
225 the number of relocations needed when a shared library is loaded dynamically,
226 it is now one long string. We cannot use a table of offsets, because the
227 lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
228 simply count through to the one we want - this isn't a performance issue
229 because these strings are used only when there is a compilation error. */
231 static const char error_texts[] =
233 "\\ at end of pattern\0"
234 "\\c at end of pattern\0"
235 "unrecognized character follows \\\0"
236 "numbers out of order in {} quantifier\0"
238 "number too big in {} quantifier\0"
239 "missing terminating ] for character class\0"
240 "invalid escape sequence in character class\0"
241 "range out of order in character class\0"
242 "nothing to repeat\0"
244 "operand of unlimited repeat could match the empty string\0" /** DEAD **/
245 "internal error: unexpected repeat\0"
246 "unrecognized character after (?\0"
247 "POSIX named classes are supported only within a class\0"
250 "reference to non-existent subpattern\0"
251 "erroffset passed as NULL\0"
252 "unknown option bit(s) set\0"
253 "missing ) after comment\0"
254 "parentheses nested too deeply\0" /** DEAD **/
256 "regular expression is too large\0"
257 "failed to get memory\0"
258 "unmatched parentheses\0"
259 "internal error: code overflow\0"
260 "unrecognized character after (?<\0"
262 "lookbehind assertion is not fixed length\0"
263 "malformed number or name after (?(\0"
264 "conditional group contains more than two branches\0"
265 "assertion expected after (?(\0"
266 "(?R or (?[+-]digits must be followed by )\0"
268 "unknown POSIX class name\0"
269 "POSIX collating elements are not supported\0"
270 "this version of PCRE is not compiled with PCRE_UTF8 support\0"
271 "spare error\0" /** DEAD **/
272 "character value in \\x{...} sequence is too large\0"
274 "invalid condition (?(0)\0"
275 "\\C not allowed in lookbehind assertion\0"
276 "PCRE does not support \\L, \\l, \\N, \\U, or \\u\0"
277 "number after (?C is > 255\0"
278 "closing ) for (?C expected\0"
280 "recursive call could loop indefinitely\0"
281 "unrecognized character after (?P\0"
282 "syntax error in subpattern name (missing terminator)\0"
283 "two named subpatterns have the same name\0"
284 "invalid UTF-8 string\0"
286 "support for \\P, \\p, and \\X has not been compiled\0"
287 "malformed \\P or \\p sequence\0"
288 "unknown property name after \\P or \\p\0"
289 "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
290 "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
292 "repeated subpattern is too long\0" /** DEAD **/
293 "octal value is greater than \\377 (not in UTF-8 mode)\0"
294 "internal error: overran compiling workspace\0"
295 "internal error: previously-checked referenced subpattern not found\0"
296 "DEFINE group contains more than one branch\0"
298 "repeating a DEFINE group is not allowed\0"
299 "inconsistent NEWLINE options\0"
300 "\\g is not followed by a braced name or an optionally braced non-zero number\0"
301 "(?+ or (?- or (?(+ or (?(- must be followed by a non-zero number\0"
302 "(*VERB) with an argument is not supported\0"
304 "(*VERB) not recognized\0"
308 /* Table to identify digits and hex digits. This is used when compiling
309 patterns. Note that the tables in chartables are dependent on the locale, and
310 may mark arbitrary characters as digits - but the PCRE compiling code expects
311 to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
312 a private table here. It costs 256 bytes, but it is a lot faster than doing
313 character value tests (at least in some simple cases I timed), and in some
314 applications one wants PCRE to compile efficiently as well as match efficiently.
317 For convenience, we use the same bit definitions as in chartables:
320 0x04 decimal digit, 0x08 hexadecimal digit
322 Then we can use ctype_digit and ctype_xdigit in the code. */
324 #ifndef EBCDIC /* This is the "normal" case, for ASCII systems */
325 static const unsigned char digitab[] =
327 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
328 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
329 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
330 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
331 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */
332 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */
333 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */
334 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */
335 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */
336 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */
337 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */
338 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */
339 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */
340 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */
341 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */
342 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */
343 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
344 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
345 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
346 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
347 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
348 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
349 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
350 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
351 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
352 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
353 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
354 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
355 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
356 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
357 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
358 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
360 #else /* This is the "abnormal" case, for EBCDIC systems */
361 static const unsigned char digitab[] =
363 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */
364 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
365 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */
366 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
367 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */
368 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
369 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */
370 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
371 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */
372 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */
373 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */
374 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- 95 */
375 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */
376 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */
377 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
378 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
379 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 */
380 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
381 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */
382 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
383 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */
384 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
385 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */
386 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
387 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 */
388 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
389 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */
390 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
391 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */
392 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
393 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */
394 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
396 static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
397 0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */
398 0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */
399 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */
400 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
401 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 32- 39 */
402 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
403 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 */
404 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
405 0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 */
406 0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */
407 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */
408 0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- 95 */
409 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */
410 0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */
411 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
412 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
413 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g */
414 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
415 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p */
416 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
417 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x */
418 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
419 0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 */
420 0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
421 0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* { - G */
422 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
423 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* } - P */
424 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
425 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* \ - X */
426 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
427 0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */
428 0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
432 /* Definition to allow mutual recursion */
435 compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
436 int *, int *, branch_chain *, compile_data *, int *);
440 /*************************************************
441 * Find an error text *
442 *************************************************/
444 /* The error texts are now all in one long string, to save on relocations. As
445 some of the text is of unknown length, we can't use a table of offsets.
446 Instead, just count through the strings. This is not a performance issue
447 because it happens only when there has been a compilation error.
449 Argument: the error number
450 Returns: pointer to the error string
454 find_error_text(int n)
456 const char *s = error_texts;
457 for (; n > 0; n--) while (*s++ != 0);
462 /*************************************************
464 *************************************************/
466 /* This function is called when a \ has been encountered. It either returns a
467 positive value for a simple escape such as \n, or a negative value which
468 encodes one of the more complicated things such as \d. A backreference to group
469 n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
470 UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
471 ptr is pointing at the \. On exit, it is on the final character of the escape sequence.
475 ptrptr points to the pattern position pointer
476 errorcodeptr points to the errorcode variable
477 bracount number of previous extracting brackets
478 options the options bits
479 isclass TRUE if inside a character class
481 Returns: zero or positive => a data character
482 negative => a special escape sequence
483 on error, errorcodeptr is set
487 check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
488 int options, BOOL isclass)
490 BOOL utf8 = (options & PCRE_UTF8) != 0;
491 const uschar *ptr = *ptrptr + 1;
494 GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
495 ptr--; /* Set pointer back to the last byte */
497 /* If backslash is at the end of the pattern, it's an error. */
499 if (c == 0) *errorcodeptr = ERR1;
501 /* Non-alphamerics are literals. For digits or letters, do an initial lookup in
502 a table. A non-zero result is something that can be returned immediately.
503 Otherwise further processing may be required. */
505 #ifndef EBCDIC /* ASCII coding */
506 else if (c < '0' || c > 'z') {} /* Not alphameric */
507 else if ((i = escapes[c - '0']) != 0) c = i;
509 #else /* EBCDIC coding */
510 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphameric */
511 else if ((i = escapes[c - 0x48]) != 0) c = i;
514 /* Escapes that need further processing, or are illegal. */
518 const uschar *oldptr;
519 BOOL braced, negated;
523 /* A number of Perl escapes are not handled by PCRE. We give an explicit error. */
531 *errorcodeptr = ERR37;
534 /* \g must be followed by a number, either plain or braced. If positive, it
535 is an absolute backreference. If negative, it is a relative backreference.
536 This is a Perl 5.10 feature. Perl 5.10 also supports \g{name} as a
537 reference to a named group. This is part of Perl's movement towards a
538 unified syntax for back references. As this is synonymous with \k{name}, we
539 fudge it up by pretending it really was \k. */
545 for (p = ptr+2; *p != 0 && *p != '}'; p++)
546 if (*p != '-' && (digitab[*p] & ctype_digit) == 0) break;
547 if (*p != 0 && *p != '}')
562 else negated = FALSE;
565 while ((digitab[ptr[1]] & ctype_digit) != 0)
566 c = c * 10 + *(++ptr) - '0';
570 *errorcodeptr = ERR61;
574 if (c == 0 || (braced && *(++ptr) != '}'))
576 *errorcodeptr = ERR57;
584 *errorcodeptr = ERR15;
587 c = bracount - (c - 1);
593 /* The handling of escape sequences consisting of a string of digits
594 starting with one that is not zero is not straightforward. By experiment,
595 the way Perl works seems to be as follows:
597 Outside a character class, the digits are read as a decimal number. If the
598 number is less than 10, or if there are that many previous extracting
599 left brackets, then it is a back reference. Otherwise, up to three octal
600 digits are read to form an escaped byte. Thus \123 is likely to be octal
601 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
602 value is greater than 377, the least significant 8 bits are taken. Inside a
603 character class, \ followed by a digit is always an octal number. */
605 case '1': case '2': case '3': case '4': case '5':
606 case '6': case '7': case '8': case '9':
612 while ((digitab[ptr[1]] & ctype_digit) != 0)
613 c = c * 10 + *(++ptr) - '0';
616 *errorcodeptr = ERR61;
619 if (c < 10 || c <= bracount)
624 ptr = oldptr; /* Put the pointer back and fall through */
627 /* Handle an octal number following \. If the first digit is 8 or 9, Perl
628 generates a binary zero byte and treats the digit as a following literal.
629 Thus we have to pull back the pointer by one. */
631 if ((c = *ptr) >= '8')
638 /* \0 always starts an octal number, but we may drop through to here with a
639 larger first octal digit. The original code used just to take the least
640 significant 8 bits of octal numbers (I think this is what early Perls used
641 to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
642 than 3 octal digits. */
646 while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
647 c = c * 8 + *(++ptr) - '0';
648 if (!utf8 && c > 255) *errorcodeptr = ERR51;
651 /* \x is complicated. \x{ddd} is a character number which can be greater
652 than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
653 treated as a data character. */
658 const uschar *pt = ptr + 2;
662 while ((digitab[*pt] & ctype_xdigit) != 0)
664 register int cc = *pt++;
665 if (c == 0 && cc == '0') continue; /* Leading zeroes */
668 #ifndef EBCDIC /* ASCII coding */
669 if (cc >= 'a') cc -= 32; /* Convert to upper case */
670 c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));
671 #else /* EBCDIC coding */
672 if (cc >= 'a' && cc <= 'z') cc += 64; /* Convert to upper case */
673 c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));
679 if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
684 /* If the sequence of hex digits does not end with '}', then we don't
685 recognize this construct; fall through to the normal \x handling. */
688 /* Read just a single-byte hex-defined char */
691 while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
693 int cc; /* Some compilers don't like ++ */
694 cc = *(++ptr); /* in initializers */
695 #ifndef EBCDIC /* ASCII coding */
696 if (cc >= 'a') cc -= 32; /* Convert to upper case */
697 c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
698 #else /* EBCDIC coding */
699 if (cc <= 'z') cc += 64; /* Convert to upper case */
700 c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
705 /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
706 This coding is ASCII-specific, but then the whole concept of \cx is
707 ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
713 *errorcodeptr = ERR2;
717 #ifndef EBCDIC /* ASCII coding */
718 if (c >= 'a' && c <= 'z') c -= 32;
720 #else /* EBCDIC coding */
721 if (c >= 'a' && c <= 'z') c += 64;
726 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
727 other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
728 for Perl compatibility, it is a literal. This code looks a bit odd, but
729 there used to be some cases other than the default, and there may be again
730 in future, so I haven't "optimized" it. */
733 if ((options & PCRE_EXTRA) != 0) switch(c)
736 *errorcodeptr = ERR3;
750 /*************************************************
752 *************************************************/
754 /* This function is called after \P or \p has been encountered, provided that
755 PCRE is compiled with support for Unicode properties. On entry, ptrptr is
756 pointing at the P or p. On exit, it is pointing at the final character of the escape sequence.
760 ptrptr points to the pattern position pointer
761 negptr points to a boolean that is set TRUE for negation else FALSE
762 dptr points to an int that is set to the detailed property value
763 errorcodeptr points to the error code variable
765 Returns: type value from ucp_type_table, or -1 for an invalid type
769 get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
772 const uschar *ptr = *ptrptr;
776 if (c == 0) goto ERROR_RETURN;
780 /* \P or \p can be followed by a name in {}, optionally preceded by ^ for negation. */
790 for (i = 0; i < (int)sizeof(name) - 1; i++)
793 if (c == 0) goto ERROR_RETURN;
797 if (c !='}') goto ERROR_RETURN;
801 /* Otherwise there is just one following character */
811 /* Search for a recognized property name using binary chop */
814 top = _pcre_utt_size;
818 i = (bot + top) >> 1;
819 c = strcmp(name, _pcre_utt_names + _pcre_utt[i].name_offset);
822 *dptr = _pcre_utt[i].value;
823 return _pcre_utt[i].type;
825 if (c > 0) bot = i + 1; else top = i;
828 *errorcodeptr = ERR47;
833 *errorcodeptr = ERR46;
842 /*************************************************
843 * Check for counted repeat *
844 *************************************************/
846 /* This function is called when a '{' is encountered in a place where it might
847 start a quantifier. It looks ahead to see if it really is a quantifier or not.
848 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
849 where the ddds are digits.
852 p pointer to the first char after '{'
854 Returns: TRUE or FALSE
858 is_counted_repeat(const uschar *p)
860 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
861 while ((digitab[*p] & ctype_digit) != 0) p++;
862 if (*p == '}') return TRUE;
864 if (*p++ != ',') return FALSE;
865 if (*p == '}') return TRUE;
867 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
868 while ((digitab[*p] & ctype_digit) != 0) p++;
875 /*************************************************
876 * Read repeat counts *
877 *************************************************/
879 /* Read an item of the form {n,m} and return the values. This is called only
880 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
881 so the syntax is guaranteed to be correct, but we need to check the values.
884 p pointer to first char after '{'
885 minp pointer to int for min
886 maxp pointer to int for max
887 returned as -1 if no max
888 errorcodeptr points to error code variable
890 Returns: pointer to '}' on success;
891 current ptr on error, with errorcodeptr set non-zero
894 static const uschar *
895 read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
900 /* Read the minimum value and do a paranoid check: a negative value indicates
901 an integer overflow. */
903 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
904 if (min < 0 || min > 65535)
906 *errorcodeptr = ERR5;
910 /* Read the maximum value if there is one, and again do a paranoid check on its size.
911 Also, max must not be less than min. */
913 if (*p == '}') max = min; else
918 while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
919 if (max < 0 || max > 65535)
921 *errorcodeptr = ERR5;
926 *errorcodeptr = ERR4;
932 /* Fill in the required variables, and pass back the pointer to the terminating '}'. */
942 /*************************************************
943 * Find forward referenced subpattern *
944 *************************************************/
946 /* This function scans along a pattern's text looking for capturing
947 subpatterns, and counting them. If it finds a named pattern that matches the
948 name it is given, it returns its number. Alternatively, if the name is NULL, it
949 returns when it reaches a given numbered subpattern. This is used for forward
950 references to subpatterns. We know that if (?P< is encountered, the name will
951 be terminated by '>' because that is checked in the first pass.
954 ptr current position in the pattern
955 count current count of capturing parens so far encountered
956 name name to seek, or NULL if seeking a numbered subpattern
957 lorn name length, or subpattern number if name is NULL
958 xmode TRUE if we are in /x mode
960 Returns: the number of the named subpattern, or -1 if not found
964 find_parens(const uschar *ptr, int count, const uschar *name, int lorn,
967 const uschar *thisname;
969 for (; *ptr != 0; ptr++)
973 /* Skip over backslashed characters and also entire \Q...\E */
977 if (*(++ptr) == 0) return -1;
978 if (*ptr == 'Q') for (;;)
980 while (*(++ptr) != 0 && *ptr != '\\');
981 if (*ptr == 0) return -1;
982 if (*(++ptr) == 'E') break;
987 /* Skip over character classes */
991 while (*(++ptr) != ']')
993 if (*ptr == 0) return -1;
996 if (*(++ptr) == 0) return -1;
997 if (*ptr == 'Q') for (;;)
999 while (*(++ptr) != 0 && *ptr != '\\');
1000 if (*ptr == 0) return -1;
1001 if (*(++ptr) == 'E') break;
1009 /* Skip comments in /x mode */
1011 if (xmode && *ptr == '#')
1013 while (*(++ptr) != 0 && *ptr != '\n');
1014 if (*ptr == 0) return -1;
1018 /* An opening parens must now be a real metacharacter */
1020 if (*ptr != '(') continue;
1021 if (ptr[1] != '?' && ptr[1] != '*')
1024 if (name == NULL && count == lorn) return count;
1029 if (*ptr == 'P') ptr++; /* Allow optional P */
1031 /* We have to disambiguate (?<! and (?<= from (?<name> */
1033 if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') &&
1039 if (name == NULL && count == lorn) return count;
1041 if (term == '<') term = '>';
1043 while (*ptr != term) ptr++;
1044 if (name != NULL && lorn == ptr - thisname &&
1045 strncmp((const char *)name, (const char *)thisname, lorn) == 0)
1054 /*************************************************
1055 * Find first significant op code *
1056 *************************************************/
1058 /* This is called by several functions that scan a compiled expression looking
1059 for a fixed first character, or an anchoring op code etc. It skips over things
1060 that do not influence this. For some calls, a change of option is important.
1061 For some calls, it makes sense to skip negative forward and all backward
1062 assertions, and also the \b assertion; for others it does not.
1065 code pointer to the start of the group
1066 options pointer to external options
1067 optbit the option bit whose changing is significant, or
1069 skipassert TRUE if certain assertions are to be skipped
1071 Returns: pointer to the first significant opcode
1074 static const uschar*
1075 first_significant_code(const uschar *code, int *options, int optbit,
1083 if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
1084 *options = (int)code[1];
1090 case OP_ASSERTBACK_NOT:
1091 if (!skipassert) return code;
1092 do code += GET(code, 1); while (*code == OP_ALT);
1093 code += _pcre_OP_lengths[*code];
1096 case OP_WORD_BOUNDARY:
1097 case OP_NOT_WORD_BOUNDARY:
1098 if (!skipassert) return code;
1105 code += _pcre_OP_lengths[*code];
1112 /* Control never reaches here */
1118 /*************************************************
1119 * Find the fixed length of a pattern *
1120 *************************************************/
1122 /* Scan a pattern and compute the fixed length of subject that will match it,
1123 if the length is fixed. This is needed for dealing with backward assertions.
1124 In UTF8 mode, the result is in characters rather than bytes.
1127 code points to the start of the pattern (the bracket)
1128 options the compiling options
1130 Returns: the fixed length, or -1 if there is no fixed length,
1131 or -2 if \C was encountered
1135 find_fixedlength(uschar *code, int options)
1139 register int branchlength = 0;
1140 register uschar *cc = code + 1 + LINK_SIZE;
1142 /* Scan along the opcodes for this branch. If we get to the end of the
1143 branch, check the length against that of the other branches. */
1148 register int op = *cc;
1155 d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
1156 if (d < 0) return d;
1158 do cc += GET(cc, 1); while (*cc == OP_ALT);
1159 cc += 1 + LINK_SIZE;
1162 /* Reached end of a branch; if it's a ket it is the end of a nested
1163 call. If it's ALT it is an alternation in a nested call. If it is
1164 END it's the end of the outer call. All can be handled by the same code. */
1171 if (length < 0) length = branchlength;
1172 else if (length != branchlength) return -1;
1173 if (*cc != OP_ALT) return length;
1174 cc += 1 + LINK_SIZE;
1178 /* Skip over assertive subpatterns */
1183 case OP_ASSERTBACK_NOT:
1184 do cc += GET(cc, 1); while (*cc == OP_ALT);
1187 /* Skip over things that don't match chars */
1201 case OP_NOT_WORD_BOUNDARY:
1202 case OP_WORD_BOUNDARY:
1203 cc += _pcre_OP_lengths[*cc];
1206 /* Handle literal characters */
1214 if ((options & PCRE_UTF8) != 0)
1216 while ((*cc & 0xc0) == 0x80) cc++;
1221 /* Handle exact repetitions. The count is already in characters, but we
1222 need to skip over a multibyte character in UTF8 mode. */
1225 branchlength += GET2(cc,1);
1228 if ((options & PCRE_UTF8) != 0)
1230 while((*cc & 0x80) == 0x80) cc++;
1236 branchlength += GET2(cc,1);
1237 if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
1241 /* Handle single-char matchers */
1250 case OP_NOT_WHITESPACE:
1252 case OP_NOT_WORDCHAR:
1259 /* The single-byte matcher isn't allowed */
1264 /* Check a class for variable quantification */
1268 cc += GET(cc, 1) - 33;
1286 if (GET2(cc,1) != GET2(cc,3)) return -1;
1287 branchlength += GET2(cc,1);
1296 /* Anything else is variable length */
1302 /* Control never gets here */
1308 /*************************************************
1309 * Scan compiled regex for numbered bracket *
1310 *************************************************/
1312 /* This little function scans through a compiled pattern until it finds a
1313 capturing bracket with the given number.
1316 code points to start of expression
1317 utf8 TRUE in UTF-8 mode
1318 number the required bracket number
1320 Returns: pointer to the opcode for the bracket, or NULL if not found
1323 static const uschar *
1324 find_bracket(const uschar *code, BOOL utf8, int number)
1328 register int c = *code;
1329 if (c == OP_END) return NULL;
1331 /* XCLASS is used for classes that cannot be represented just by a bit
1332 map. This includes negated single high-valued characters. The length in
1333 the table is zero; the actual length is stored in the compiled code. */
1335 if (c == OP_XCLASS) code += GET(code, 1);
1337 /* Handle capturing bracket */
1339 else if (c == OP_CBRA)
1341 int n = GET2(code, 1+LINK_SIZE);
1342 if (n == number) return (uschar *)code;
1343 code += _pcre_OP_lengths[c];
1346 /* Otherwise, we can get the item's length from the table, except that for
1347 repeated character types, we have to test for \p and \P, which have an extra
1348 two bytes of parameters. */
1355 case OP_TYPEMINSTAR:
1357 case OP_TYPEMINPLUS:
1359 case OP_TYPEMINQUERY:
1360 case OP_TYPEPOSSTAR:
1361 case OP_TYPEPOSPLUS:
1362 case OP_TYPEPOSQUERY:
1363 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1367 case OP_TYPEMINUPTO:
1369 case OP_TYPEPOSUPTO:
1370 if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1374 /* Add in the fixed length from the table */
1376 code += _pcre_OP_lengths[c];
1378 /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1379 a multi-byte character. The length in the table is a minimum, so we have to
1380 arrange to skip the extra bytes. */
1400 if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1410 /*************************************************
1411 * Scan compiled regex for recursion reference *
1412 *************************************************/
1414 /* This little function scans through a compiled pattern until it finds an
1415 instance of OP_RECURSE.
1418 code points to start of expression
1419 utf8 TRUE in UTF-8 mode
1421 Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1424 static const uschar *
1425 find_recurse(const uschar *code, BOOL utf8)
1429 register int c = *code;
1430 if (c == OP_END) return NULL;
1431 if (c == OP_RECURSE) return code;
1433 /* XCLASS is used for classes that cannot be represented just by a bit
1434 map. This includes negated single high-valued characters. The length in
1435 the table is zero; the actual length is stored in the compiled code. */
1437 if (c == OP_XCLASS) code += GET(code, 1);
1439 /* Otherwise, we can get the item's length from the table, except that for
1440 repeated character types, we have to test for \p and \P, which have an extra
1441 two bytes of parameters. */
1448 case OP_TYPEMINSTAR:
1450 case OP_TYPEMINPLUS:
1452 case OP_TYPEMINQUERY:
1453 case OP_TYPEPOSSTAR:
1454 case OP_TYPEPOSPLUS:
1455 case OP_TYPEPOSQUERY:
1456 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1459 case OP_TYPEPOSUPTO:
1461 case OP_TYPEMINUPTO:
1463 if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1467 /* Add in the fixed length from the table */
1469 code += _pcre_OP_lengths[c];
1471 /* In UTF-8 mode, opcodes that are followed by a character may be followed
1472 by a multi-byte character. The length in the table is a minimum, so we have
1473 to arrange to skip the extra bytes. */
1493 if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1503 /*************************************************
1504 * Scan compiled branch for non-emptiness *
1505 *************************************************/
1507 /* This function scans through a branch of a compiled pattern to see whether it
1508 can match the empty string or not. It is called from could_be_empty()
1509 below and from compile_branch() when checking for an unlimited repeat of a
1510 group that can match nothing. Note that first_significant_code() skips over
1511 assertions. If we hit an unclosed bracket, we return "empty" - this means we've
1512 struck an inner bracket whose current branch will already have been scanned.
1515 code points to start of search
1516 endcode points to where to stop
1517 utf8 TRUE if in UTF8 mode
1519 Returns: TRUE if what is matched could be empty
1523 could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1526 for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1528 code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1530 const uschar *ccode;
1534 /* Groups with zero repeats can of course be empty; skip them. */
1536 if (c == OP_BRAZERO || c == OP_BRAMINZERO)
1538 code += _pcre_OP_lengths[c];
1539 do code += GET(code, 1); while (*code == OP_ALT);
1544 /* For other groups, scan the branches. */
1546 if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND)
1549 if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1551 /* Scan a closed bracket */
1553 empty_branch = FALSE;
1556 if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1557 empty_branch = TRUE;
1558 code += GET(code, 1);
1560 while (*code == OP_ALT);
1561 if (!empty_branch) return FALSE; /* All branches are non-empty */
1566 /* Handle the other opcodes */
1570 /* Check for quantifiers after a class. XCLASS is used for classes that
1571 cannot be represented just by a bit map. This includes negated single
1572 high-valued characters. The length in _pcre_OP_lengths[] is zero; the
1573 actual length is stored in the compiled code, so we must update "code"
1578 ccode = code += GET(code, 1);
1579 goto CHECK_CLASS_REPEAT;
1592 case OP_CRSTAR: /* These could be empty; continue */
1598 default: /* Non-repeat => class must match */
1599 case OP_CRPLUS: /* These repeats aren't empty */
1605 if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1610 /* Opcodes that must match a character */
1617 case OP_NOT_WHITESPACE:
1619 case OP_NOT_WORDCHAR:
1635 case OP_TYPEMINPLUS:
1636 case OP_TYPEPOSPLUS:
1640 /* These are going to continue, as they may be empty, but we have to
1641 fudge the length for the \p and \P cases. */
1644 case OP_TYPEMINSTAR:
1645 case OP_TYPEPOSSTAR:
1647 case OP_TYPEMINQUERY:
1648 case OP_TYPEPOSQUERY:
1649 if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1652 /* Same for these */
1655 case OP_TYPEMINUPTO:
1656 case OP_TYPEPOSUPTO:
1657 if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1668 /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1669 MINUPTO, and POSUPTO may be followed by a multibyte character */
1681 if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1692 /*************************************************
1693 * Scan compiled regex for non-emptiness *
1694 *************************************************/
1696 /* This function is called to check for left recursive calls. We want to check
1697 the current branch of the current pattern to see if it could match the empty
1698 string. If it could, we must look outwards for branches at other levels,
1699 stopping when we pass beyond the bracket which is the subject of the recursion.
1702 code points to start of the recursion
1703 endcode points to where to stop (current RECURSE item)
1704 bcptr points to the chain of current (unclosed) branch starts
1705 utf8 TRUE if in UTF-8 mode
1707 Returns: TRUE if what is matched could be empty
1711 could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1714 while (bcptr != NULL && bcptr->current >= code)
1716 if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1717 bcptr = bcptr->outer;
1724 /*************************************************
1725 * Check for POSIX class syntax *
1726 *************************************************/
1728 /* This function is called when the sequence "[:" or "[." or "[=" is
1729 encountered in a character class. It checks whether this is followed by an
1730 optional ^ and then a sequence of letters, terminated by a matching ":]" or
1734 ptr pointer to the initial [
1735 endptr where to return the end pointer
1736 cd pointer to compile data
1738 Returns: TRUE or FALSE
1742 check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
1744 int terminator; /* Don't combine these lines; the Solaris cc */
1745 terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
1746 if (*(++ptr) == '^') ptr++;
1747 while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
1748 if (*ptr == terminator && ptr[1] == ']')
1759 /*************************************************
1760 * Check POSIX class name *
1761 *************************************************/
1763 /* This function is called to check the name given in a POSIX-style class entry
1767 ptr points to the first letter
1768 len the length of the name
1770 Returns: a value representing the name, or -1 if unknown
1774 check_posix_name(const uschar *ptr, int len)
1776 const char *pn = posix_names;
1777 register int yield = 0;
1778 while (posix_name_lengths[yield] != 0)
1780 if (len == posix_name_lengths[yield] &&
1781 strncmp((const char *)ptr, pn, len) == 0) return yield;
1782 pn += posix_name_lengths[yield] + 1;
1789 /*************************************************
1790 * Adjust OP_RECURSE items in repeated group *
1791 *************************************************/
1793 /* OP_RECURSE items contain an offset from the start of the regex to the group
1794 that is referenced. This means that groups can be replicated for fixed
1795 repetition simply by copying (because the recursion is allowed to refer to
1796 earlier groups that are outside the current group). However, when a group is
1797 optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
1798 it, after it has been compiled. This means that any OP_RECURSE items within it
1799 that refer to the group itself or any contained groups have to have their
1800 offsets adjusted. That is one of the jobs of this function. Before it is called,
1801 the partially compiled regex must be temporarily terminated with OP_END.
1803 This function has been extended with the possibility of forward references for
1804 recursions and subroutine calls. It must also check the list of such references
1805 for the group we are dealing with. If it finds that one of the recursions in
1806 the current group is on this list, it adjusts the offset in the list, not the
1807 value in the reference (which is a group number).
1810 group points to the start of the group
1811 adjust the amount by which the group is to be moved
1812 utf8 TRUE in UTF-8 mode
1813 cd contains pointers to tables etc.
1814 save_hwm the hwm forward reference pointer at the start of the group
1820 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
1823 uschar *ptr = group;
1825 while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1830 /* See if this recursion is on the forward reference list. If so, adjust the
1833 for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
1835 offset = GET(hc, 0);
1836 if (cd->start_code + offset == ptr + 1)
1838 PUT(hc, 0, offset + adjust);
1843 /* Otherwise, adjust the recursion offset if it's after the start of this
1848 offset = GET(ptr, 1);
1849 if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1852 ptr += 1 + LINK_SIZE;
1858 /*************************************************
1859 * Insert an automatic callout point *
1860 *************************************************/
1862 /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
1863 callout points before each pattern item.
1866 code current code pointer
1867 ptr current pattern pointer
1868 cd pointers to tables etc
1870 Returns: new code pointer
1874 auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
1876 *code++ = OP_CALLOUT;
1878 PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
1879 PUT(code, LINK_SIZE, 0); /* Default length */
1880 return code + 2*LINK_SIZE;
1885 /*************************************************
1886 * Complete a callout item *
1887 *************************************************/
1889 /* A callout item contains the length of the next item in the pattern, which
1890 we can't fill in till after we have reached the relevant point. This is used
1891 for both automatic and manual callouts.
1894 previous_callout points to previous callout item
1895 ptr current pattern pointer
1896 cd pointers to tables etc
1902 complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
1904 int length = ptr - cd->start_pattern - GET(previous_callout, 2);
1905 PUT(previous_callout, 2 + LINK_SIZE, length);
1911 /*************************************************
1912 * Get othercase range *
1913 *************************************************/
1915 /* This function is passed the start and end of a class range, in UTF-8 mode
1916 with UCP support. It searches up the characters, looking for internal ranges of
1917 characters in the "other" case. Each call returns the next one, updating the
1921 cptr points to starting character value; updated
1923 ocptr where to put start of othercase range
1924 odptr where to put end of othercase range
1926 Yield: TRUE when range returned; FALSE when no more
1930 get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
1931 unsigned int *odptr)
1933 unsigned int c, othercase, next;
1935 for (c = *cptr; c <= d; c++)
1936 { if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR) break; }
1938 if (c > d) return FALSE;
1941 next = othercase + 1;
1943 for (++c; c <= d; c++)
1945 if (_pcre_ucp_othercase(c) != next) break;
1954 #endif /* SUPPORT_UCP */
1958 /*************************************************
1959 * Check if auto-possessifying is possible *
1960 *************************************************/
1962 /* This function is called for unlimited repeats of certain items, to see
1963 whether the next thing could possibly match the repeated item. If not, it makes
1964 sense to automatically possessify the repeated item.
1967 op_code the repeated op code
1968 this data for this item, depends on the opcode
1969 utf8 TRUE in UTF-8 mode
1970 utf8_char used for utf8 character bytes, NULL if not relevant
1971 ptr next character in pattern
1972 options options bits
1973 cd contains pointers to tables etc.
1975 Returns: TRUE if possessifying is wanted
1979 check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
1980 const uschar *ptr, int options, compile_data *cd)
1984 /* Skip whitespace and comments in extended mode */
1986 if ((options & PCRE_EXTENDED) != 0)
1990 while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
1993 while (*(++ptr) != 0)
1994 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2000 /* If the next item is one that we can handle, get its value. A non-negative
2001 value is a character, a negative value is an escape value. */
2005 int temperrorcode = 0;
2006 next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
2007 if (temperrorcode != 0) return FALSE;
2008 ptr++; /* Point after the escape sequence */
2011 else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
2014 if (utf8) { GETCHARINC(next, ptr); } else
2021 /* Skip whitespace and comments in extended mode */
2023 if ((options & PCRE_EXTENDED) != 0)
2027 while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2030 while (*(++ptr) != 0)
2031 if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2037 /* If the next thing is itself optional, we have to give up. */
2039 if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0)
2042 /* Now compare the next item with the previous opcode. If the previous is a
2043 positive single character match, "item" either contains the character or, if
2044 "item" is greater than 127 in utf8 mode, the character's bytes are in
2048 /* Handle cases when the next item is a character. */
2050 if (next >= 0) switch(op_code)
2054 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2056 return item != next;
2058 /* For CHARNC (caseless character) we must check the other case. If we have
2059 Unicode property support, we can use it to test the other case of
2060 high-valued characters. */
2064 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2066 if (item == next) return FALSE;
2070 unsigned int othercase;
2071 if (next < 128) othercase = cd->fcc[next]; else
2073 othercase = _pcre_ucp_othercase((unsigned int)next);
2075 othercase = NOTACHAR;
2077 return (unsigned int)item != othercase;
2080 #endif /* SUPPORT_UTF8 */
2081 return (item != cd->fcc[next]); /* Non-UTF-8 mode */
2083 /* For OP_NOT, "item" must be a single-byte character. */
2086 if (next < 0) return FALSE; /* Not a character */
2087 if (item == next) return TRUE;
2088 if ((options & PCRE_CASELESS) == 0) return FALSE;
2092 unsigned int othercase;
2093 if (next < 128) othercase = cd->fcc[next]; else
2095 othercase = _pcre_ucp_othercase(next);
2097 othercase = NOTACHAR;
2099 return (unsigned int)item == othercase;
2102 #endif /* SUPPORT_UTF8 */
2103 return (item == cd->fcc[next]); /* Non-UTF-8 mode */
2106 return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
2109 return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
2112 return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
2114 case OP_NOT_WHITESPACE:
2115 return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
2118 return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
2120 case OP_NOT_WORDCHAR:
2121 return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
2146 return op_code != OP_HSPACE;
2148 return op_code == OP_HSPACE;
2162 return op_code != OP_VSPACE;
2164 return op_code == OP_VSPACE;
2172 /* Handle the case when the next item is \d, \s, etc. */
2179 if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2184 return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
2187 return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
2190 return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
2193 return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
2196 return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
2199 return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
2224 return -next != ESC_h;
2226 return -next == ESC_h;
2240 return -next != ESC_v;
2242 return -next == ESC_v;
2250 return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
2251 next == -ESC_h || next == -ESC_v;
2254 return next == -ESC_d;
2257 return next == -ESC_S || next == -ESC_d || next == -ESC_w;
2259 case OP_NOT_WHITESPACE:
2260 return next == -ESC_s || next == -ESC_h || next == -ESC_v;
2263 return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w;
2266 return next == -ESC_h;
2268 /* Can't have \S in here because VT matches \S (Perl anomaly) */
2270 return next == -ESC_V || next == -ESC_d || next == -ESC_w;
2273 return next == -ESC_v;
2276 return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;
2278 case OP_NOT_WORDCHAR:
2279 return next == -ESC_w || next == -ESC_d;
2285 /* Control does not reach here */
2290 /*************************************************
2291 * Compile one branch *
2292 *************************************************/
2294 /* Scan the pattern, compiling it into a vector. If the options are
2295 changed during the branch, the pointer is used to change the external options
2296 bits. This function is used during the pre-compile phase when we are trying
2297 to find out the amount of memory needed, as well as during the real compile
2298 phase. The value of lengthptr distinguishes the two phases.
2301 optionsptr pointer to the option bits
2302 codeptr points to the pointer to the current code point
2303 ptrptr points to the current pattern pointer
2304 errorcodeptr points to error code variable
2305 firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2306 reqbyteptr set to the last literal character required, else < 0
2307 bcptr points to current branch chain
2308 cd contains pointers to tables etc.
2309 lengthptr NULL during the real compile phase
2310 points to length accumulator during pre-compile phase
2312 Returns: TRUE on success
2313 FALSE, with *errorcodeptr set non-zero on error
2317 compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2318 int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2319 compile_data *cd, int *lengthptr)
2321 int repeat_type, op_type;
2322 int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
2324 int greedy_default, greedy_non_default;
2325 int firstbyte, reqbyte;
2326 int zeroreqbyte, zerofirstbyte;
2327 int req_caseopt, reqvary, tempreqvary;
2328 int options = *optionsptr;
2329 int after_manual_callout = 0;
2330 int length_prevgroup = 0;
2332 register uschar *code = *codeptr;
2333 uschar *last_code = code;
2334 uschar *orig_code = code;
2336 BOOL inescq = FALSE;
2337 BOOL groupsetfirstbyte = FALSE;
2338 const uschar *ptr = *ptrptr;
2339 const uschar *tempptr;
2340 uschar *previous = NULL;
2341 uschar *previous_callout = NULL;
2342 uschar *save_hwm = NULL;
2343 uschar classbits[32];
2347 BOOL utf8 = (options & PCRE_UTF8) != 0;
2348 uschar *class_utf8data;
2349 uschar utf8_char[6];
2352 uschar *utf8_char = NULL;
2356 if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2359 /* Set up the default and non-default settings for greediness */
2361 greedy_default = ((options & PCRE_UNGREEDY) != 0);
2362 greedy_non_default = greedy_default ^ 1;
2364 /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2365 matching encountered yet". It gets changed to REQ_NONE if we hit something that
2366 matches a non-fixed char first char; reqbyte just remains unset if we never
2369 When we hit a repeat whose minimum is zero, we may have to adjust these values
2370 to take the zero repeat into account. This is implemented by setting them to
2371 zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2372 item types that can be repeated set these backoff variables appropriately. */
2374 firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2376 /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2377 according to the current setting of the caseless flag. REQ_CASELESS is a bit
2378 value > 255. It is added into the firstbyte or reqbyte variables to record the
2379 case status of the value. This is used only for ASCII characters. */
2381 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2383 /* Switch on next character until the end of the branch */
2388 BOOL possessive_quantifier;
2391 BOOL reset_bracount;
2392 int class_charcount;
2404 /* Get next byte in the pattern */
2408 /* If we are in the pre-compile phase, accumulate the length used for the
2409 previous cycle of this loop. */
2411 if (lengthptr != NULL)
2414 if (code > cd->hwm) cd->hwm = code; /* High water info */
2416 if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
2418 *errorcodeptr = ERR52;
2422 /* There is at least one situation where code goes backwards: this is the
2423 case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2424 the class is simply eliminated. However, it is created first, so we have to
2425 allow memory for it. Therefore, don't ever reduce the length at this point.
2428 if (code < last_code) code = last_code;
2430 /* Paranoid check for integer overflow */
2432 if (OFLOW_MAX - *lengthptr < code - last_code)
2434 *errorcodeptr = ERR20;
2438 *lengthptr += code - last_code;
2439 DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2441 /* If "previous" is set and it is not at the start of the work space, move
2442 it back to there, in order to avoid filling up the work space. Otherwise,
2443 if "previous" is NULL, reset the current code pointer to the start. */
2445 if (previous != NULL)
2447 if (previous > orig_code)
2449 memmove(orig_code, previous, code - previous);
2450 code -= previous - orig_code;
2451 previous = orig_code;
2454 else code = orig_code;
2456 /* Remember where this code item starts so we can pick up the length
2462 /* In the real compile phase, just check the workspace used by the forward
2465 else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
2467 *errorcodeptr = ERR52;
2471 /* If in \Q...\E, check for the end; if not, we have a literal */
2473 if (inescq && c != 0)
2475 if (c == '\\' && ptr[1] == 'E')
2483 if (previous_callout != NULL)
2485 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2486 complete_callout(previous_callout, ptr, cd);
2487 previous_callout = NULL;
2489 if ((options & PCRE_AUTO_CALLOUT) != 0)
2491 previous_callout = code;
2492 code = auto_callout(code, ptr, cd);
2498 /* Fill in length of a previous callout, except when the next thing is
2501 is_quantifier = c == '*' || c == '+' || c == '?' ||
2502 (c == '{' && is_counted_repeat(ptr+1));
2504 if (!is_quantifier && previous_callout != NULL &&
2505 after_manual_callout-- <= 0)
2507 if (lengthptr == NULL) /* Don't attempt in pre-compile phase */
2508 complete_callout(previous_callout, ptr, cd);
2509 previous_callout = NULL;
2512 /* In extended mode, skip white space and comments */
2514 if ((options & PCRE_EXTENDED) != 0)
2516 if ((cd->ctypes[c] & ctype_space) != 0) continue;
2519 while (*(++ptr) != 0)
2521 if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2523 if (*ptr != 0) continue;
2525 /* Else fall through to handle end of string */
2530 /* No auto callout for quantifiers. */
2532 if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
2534 previous_callout = code;
2535 code = auto_callout(code, ptr, cd);
2540 /* ===================================================================*/
2541 case 0: /* The branch terminates at string end */
2542 case '|': /* or | or ) */
2544 *firstbyteptr = firstbyte;
2545 *reqbyteptr = reqbyte;
2548 if (lengthptr != NULL)
2550 if (OFLOW_MAX - *lengthptr < code - last_code)
2552 *errorcodeptr = ERR20;
2555 *lengthptr += code - last_code; /* To include callout length */
2556 DPRINTF((">> end branch\n"));
2561 /* ===================================================================*/
2562 /* Handle single-character metacharacters. In multiline mode, ^ disables
2563 the setting of any following char as a first character. */
2566 if ((options & PCRE_MULTILINE) != 0)
2568 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2579 /* There can never be a first char if '.' is first, whatever happens about
2580 repeats. The value of reqbyte doesn't change either. */
2583 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2584 zerofirstbyte = firstbyte;
2585 zeroreqbyte = reqbyte;
2591 /* ===================================================================*/
2592 /* Character classes. If the included characters are all < 256, we build a
2593 32-byte bitmap of the permitted characters, except in the special case
2594 where there is only one such character. For negated classes, we build the
2595 map as usual, then invert it at the end. However, we use a different opcode
2596 so that data characters > 255 can be handled correctly.
2598 If the class contains characters outside the 0-255 range, a different
2599 opcode is compiled. It may optionally have a bit map for characters < 256,
2600 but those above are explicitly listed afterwards. A flag byte tells
2601 whether the bitmap is present, and whether this is a negated class or not.
2607 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
2608 they are encountered at the top level, so we'll do that too. */
2610 if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2611 check_posix_syntax(ptr, &tempptr, cd))
2613 *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;
2617 /* If the first character is '^', set the negation flag and skip it. Also,
2618 if the first few characters (either before or after ^) are \Q\E or \E we
2619 skip them too. This makes for compatibility with Perl. */
2621 negate_class = FALSE;
2627 if (ptr[1] == 'E') ptr++;
2628 else if (strncmp((const char *)ptr+1, "Q\\E", 3) == 0) ptr += 3;
2631 else if (!negate_class && c == '^')
2632 negate_class = TRUE;
2636 /* Keep a count of chars with values < 256 so that we can optimize the case
2637 of just a single character (as long as it's < 256). However, for higher
2638 valued UTF-8 characters, we don't yet do any optimization. */
2640 class_charcount = 0;
2641 class_lastchar = -1;
2643 /* Initialize the 32-char bit map to all zeros. We build the map in a
2644 temporary bit of memory, in case the class contains only 1 character (less
2645 than 256), because in that case the compiled code doesn't use the bit map.
2648 memset(classbits, 0, 32 * sizeof(uschar));
2651 class_utf8 = FALSE; /* No chars >= 256 */
2652 class_utf8data = code + LINK_SIZE + 2; /* For UTF-8 items */
2655 /* Process characters until ] is reached. By writing this as a "do" it
2656 means that an initial ] is taken as a data character. At the start of the
2657 loop, c contains the first byte of the character. */
2661 const uschar *oldptr;
2664 if (utf8 && c > 127)
2665 { /* Braces are required because the */
2666 GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
2670 /* Inside \Q...\E everything is literal except \E */
2674 if (c == '\\' && ptr[1] == 'E') /* If we are at \E */
2676 inescq = FALSE; /* Reset literal state */
2677 ptr++; /* Skip the 'E' */
2678 continue; /* Carry on with next */
2680 goto CHECK_RANGE; /* Could be range if \E follows */
2683 /* Handle POSIX class names. Perl allows a negation extension of the
2684 form [:^name:]. A square bracket that doesn't match the syntax is
2685 treated as a literal. We also recognize the POSIX constructions
2686 [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
2690 (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2691 check_posix_syntax(ptr, &tempptr, cd))
2693 BOOL local_negate = FALSE;
2694 int posix_class, taboffset, tabopt;
2695 register const uschar *cbits = cd->cbits;
2700 *errorcodeptr = ERR31;
2707 local_negate = TRUE;
2711 posix_class = check_posix_name(ptr, tempptr - ptr);
2712 if (posix_class < 0)
2714 *errorcodeptr = ERR30;
2718 /* If matching is caseless, upper and lower are converted to
2719 alpha. This relies on the fact that the class table starts with
2720 alpha, lower, upper as the first 3 entries. */
2722 if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
2725 /* We build the bit map for the POSIX class in a chunk of local store
2726 because we may be adding and subtracting from it, and we don't want to
2727 subtract bits that may be in the main map already. At the end we or the
2728 result into the bit map that is being built. */
2732 /* Copy in the first table (always present) */
2734 memcpy(pbits, cbits + posix_class_maps[posix_class],
2735 32 * sizeof(uschar));
2737 /* If there is a second table, add or remove it as required. */
2739 taboffset = posix_class_maps[posix_class + 1];
2740 tabopt = posix_class_maps[posix_class + 2];
2745 for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
2747 for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
2750 /* Now see if we need to remove any special characters. An option
2751 value of 1 removes vertical space and 2 removes underscore. */
2753 if (tabopt < 0) tabopt = -tabopt;
2754 if (tabopt == 1) pbits[1] &= ~0x3c;
2755 else if (tabopt == 2) pbits[11] &= 0x7f;
2757 /* Add the POSIX table or its complement into the main table that is
2758 being built and we are done. */
2761 for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
2763 for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
2766 class_charcount = 10; /* Set > 1; assumes more than 1 per class */
2767 continue; /* End of POSIX syntax handling */
2770 /* Backslash may introduce a single character, or it may introduce one
2771 of the specials, which just set a flag. The sequence \b is a special
2772 case. Inside a class (and only there) it is treated as backspace.
2773 Elsewhere it marks a word boundary. Other escapes have preset maps ready
2774 to 'or' into the one we are building. We assume they have more than one
2775 character in them, so set class_charcount bigger than one. */
2779 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
2780 if (*errorcodeptr != 0) goto FAILED;
2782 if (-c == ESC_b) c = '\b'; /* \b is backslash in a class */
2783 else if (-c == ESC_X) c = 'X'; /* \X is literal X in a class */
2784 else if (-c == ESC_R) c = 'R'; /* \R is literal R in a class */
2785 else if (-c == ESC_Q) /* Handle start of quoted string */
2787 if (ptr[1] == '\\' && ptr[2] == 'E')
2789 ptr += 2; /* avoid empty string */
2794 else if (-c == ESC_E) continue; /* Ignore orphan \E */
2798 register const uschar *cbits = cd->cbits;
2799 class_charcount += 2; /* Greater than 1 is what matters */
2801 /* Save time by not doing this in the pre-compile phase. */
2803 if (lengthptr == NULL) switch (-c)
2806 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
2810 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
2814 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
2818 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
2822 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
2823 classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
2827 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
2828 classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
2831 case ESC_E: /* Perl ignores an orphan \E */
2834 default: /* Not recognized; fall through */
2835 break; /* Need "default" setting to stop compiler warning. */
2838 /* In the pre-compile phase, just do the recognition. */
2840 else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
2841 c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
2843 /* We need to deal with \H, \h, \V, and \v in both phases because
2844 they use extra memory. */
2848 SETBIT(classbits, 0x09); /* VT */
2849 SETBIT(classbits, 0x20); /* SPACE */
2850 SETBIT(classbits, 0xa0); /* NSBP */
2855 *class_utf8data++ = XCL_SINGLE;
2856 class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
2857 *class_utf8data++ = XCL_SINGLE;
2858 class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
2859 *class_utf8data++ = XCL_RANGE;
2860 class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
2861 class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
2862 *class_utf8data++ = XCL_SINGLE;
2863 class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
2864 *class_utf8data++ = XCL_SINGLE;
2865 class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
2866 *class_utf8data++ = XCL_SINGLE;
2867 class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
2875 for (c = 0; c < 32; c++)
2880 case 0x09/8: x ^= 1 << (0x09%8); break;
2881 case 0x20/8: x ^= 1 << (0x20%8); break;
2882 case 0xa0/8: x ^= 1 << (0xa0%8); break;
2892 *class_utf8data++ = XCL_RANGE;
2893 class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2894 class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
2895 *class_utf8data++ = XCL_RANGE;
2896 class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
2897 class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
2898 *class_utf8data++ = XCL_RANGE;
2899 class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
2900 class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
2901 *class_utf8data++ = XCL_RANGE;
2902 class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
2903 class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
2904 *class_utf8data++ = XCL_RANGE;
2905 class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
2906 class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
2907 *class_utf8data++ = XCL_RANGE;
2908 class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
2909 class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
2910 *class_utf8data++ = XCL_RANGE;
2911 class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
2912 class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2920 SETBIT(classbits, 0x0a); /* LF */
2921 SETBIT(classbits, 0x0b); /* VT */
2922 SETBIT(classbits, 0x0c); /* FF */
2923 SETBIT(classbits, 0x0d); /* CR */
2924 SETBIT(classbits, 0x85); /* NEL */
2929 *class_utf8data++ = XCL_RANGE;
2930 class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
2931 class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
2939 for (c = 0; c < 32; c++)
2944 case 0x0a/8: x ^= 1 << (0x0a%8);
2949 case 0x85/8: x ^= 1 << (0x85%8); break;
2959 *class_utf8data++ = XCL_RANGE;
2960 class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
2961 class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
2962 *class_utf8data++ = XCL_RANGE;
2963 class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
2964 class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
2970 /* We need to deal with \P and \p in both phases. */
2973 if (-c == ESC_p || -c == ESC_P)
2977 int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
2978 if (ptype < 0) goto FAILED;
2980 *class_utf8data++ = ((-c == ESC_p) != negated)?
2981 XCL_PROP : XCL_NOTPROP;
2982 *class_utf8data++ = ptype;
2983 *class_utf8data++ = pdata;
2984 class_charcount -= 2; /* Not a < 256 character */
2988 /* Unrecognized escapes are faulted if PCRE is running in its
2989 strict mode. By default, for compatibility with Perl, they are
2990 treated as literals. */
2992 if ((options & PCRE_EXTRA) != 0)
2994 *errorcodeptr = ERR7;
2998 class_charcount -= 2; /* Undo the default count from above */
2999 c = *ptr; /* Get the final character and fall through */
3002 /* Fall through if we have a single character (c >= 0). This may be
3003 greater than 256 in UTF-8 mode. */
3005 } /* End of backslash handling */
3007 /* A single character may be followed by '-' to form a range. However,
3008 Perl does not permit ']' to be the end of the range. A '-' character
3009 at the end is treated as a literal. Perl ignores orphaned \E sequences
3010 entirely. The code for handling \Q and \E is messy. */
3013 while (ptr[1] == '\\' && ptr[2] == 'E')
3021 /* Remember \r or \n */
3023 if (c == '\r' || c == '\n') cd->external_flags |= PCRE_HASCRORLF;
3025 /* Check for range */
3027 if (!inescq && ptr[1] == '-')
3031 while (*ptr == '\\' && ptr[1] == 'E') ptr += 2;
3033 /* If we hit \Q (not followed by \E) at this point, go into escaped
3036 while (*ptr == '\\' && ptr[1] == 'Q')
3039 if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; }
3044 if (*ptr == 0 || (!inescq && *ptr == ']'))
3047 goto LONE_SINGLE_CHARACTER;
3052 { /* Braces are required because the */
3053 GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
3057 d = *ptr; /* Not UTF-8 mode */
3059 /* The second part of a range can be a single-character escape, but
3060 not any of the other escapes. Perl 5.6 treats a hyphen as a literal
3061 in such circumstances. */
3063 if (!inescq && d == '\\')
3065 d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3066 if (*errorcodeptr != 0) goto FAILED;
3068 /* \b is backspace; \X is literal X; \R is literal R; any other
3069 special means the '-' was literal */
3073 if (d == -ESC_b) d = '\b';
3074 else if (d == -ESC_X) d = 'X';
3075 else if (d == -ESC_R) d = 'R'; else
3078 goto LONE_SINGLE_CHARACTER; /* A few lines below */
3083 /* Check that the two values are in the correct order. Optimize
3084 one-character ranges */
3088 *errorcodeptr = ERR8;
3092 if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
3094 /* Remember \r or \n */
3096 if (d == '\r' || d == '\n') cd->external_flags |= PCRE_HASCRORLF;
3098 /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
3099 matching, we have to use an XCLASS with extra data items. Caseless
3100 matching for characters > 127 is available only if UCP support is
3104 if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
3108 /* With UCP support, we can find the other case equivalents of
3109 the relevant characters. There may be several ranges. Optimize how
3110 they fit with the basic range. */
3113 if ((options & PCRE_CASELESS) != 0)
3115 unsigned int occ, ocd;
3116 unsigned int cc = c;
3117 unsigned int origd = d;
3118 while (get_othercase_range(&cc, origd, &occ, &ocd))
3120 if (occ >= (unsigned int)c &&
3121 ocd <= (unsigned int)d)
3122 continue; /* Skip embedded ranges */
3124 if (occ < (unsigned int)c &&
3125 ocd >= (unsigned int)c - 1) /* Extend the basic range */
3126 { /* if there is overlap, */
3127 c = occ; /* noting that if occ < c */
3128 continue; /* we can't have ocd > d */
3129 } /* because a subrange is */
3130 if (ocd > (unsigned int)d &&
3131 occ <= (unsigned int)d + 1) /* always shorter than */
3132 { /* the basic range. */
3139 *class_utf8data++ = XCL_SINGLE;
3143 *class_utf8data++ = XCL_RANGE;
3144 class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
3146 class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
3149 #endif /* SUPPORT_UCP */
3151 /* Now record the original range, possibly modified for UCP caseless
3152 overlapping ranges. */
3154 *class_utf8data++ = XCL_RANGE;
3155 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3156 class_utf8data += _pcre_ord2utf8(d, class_utf8data);
3158 /* With UCP support, we are done. Without UCP support, there is no
3159 caseless matching for UTF-8 characters > 127; we can use the bit map
3160 for the smaller ones. */
3163 continue; /* With next character in the class */
3165 if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
3167 /* Adjust upper limit and fall through to set up the map */
3171 #endif /* SUPPORT_UCP */
3173 #endif /* SUPPORT_UTF8 */
3175 /* We use the bit map for all cases when not in UTF-8 mode; else
3176 ranges that lie entirely within 0-127 when there is UCP support; else
3177 for partial ranges without UCP support. */
3179 class_charcount += d - c + 1;
3182 /* We can save a bit of time by skipping this in the pre-compile. */
3184 if (lengthptr == NULL) for (; c <= d; c++)
3186 classbits[c/8] |= (1 << (c&7));
3187 if ((options & PCRE_CASELESS) != 0)
3189 int uc = cd->fcc[c]; /* flip case */
3190 classbits[uc/8] |= (1 << (uc&7));
3194 continue; /* Go get the next char in the class */
3197 /* Handle a lone single character - we can get here for a normal
3198 non-escape char, or after \ that introduces a single character or for an
3199 apparent range that isn't. */
3201 LONE_SINGLE_CHARACTER:
3203 /* Handle a character that cannot go in the bit map */
3206 if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
3209 *class_utf8data++ = XCL_SINGLE;
3210 class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3213 if ((options & PCRE_CASELESS) != 0)
3215 unsigned int othercase;
3216 if ((othercase = _pcre_ucp_othercase(c)) != NOTACHAR)
3218 *class_utf8data++ = XCL_SINGLE;
3219 class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
3222 #endif /* SUPPORT_UCP */
3226 #endif /* SUPPORT_UTF8 */
3228 /* Handle a single-byte character */
3230 classbits[c/8] |= (1 << (c&7));
3231 if ((options & PCRE_CASELESS) != 0)
3233 c = cd->fcc[c]; /* flip case */
3234 classbits[c/8] |= (1 << (c&7));
3241 /* Loop until ']' reached. This "while" is the end of the "do" above. */
3243 while ((c = *(++ptr)) != 0 && (c != ']' || inescq));
3245 if (c == 0) /* Missing terminating ']' */
3247 *errorcodeptr = ERR6;
3252 /* This code has been disabled because it would mean that \s counts as
3253 an explicit \r or \n reference, and that's not really what is wanted. Now
3254 we set the flag only if there is a literal "\r" or "\n" in the class. */
3257 /* Remember whether \r or \n are in this class */
3261 if ((classbits[1] & 0x24) != 0x24) cd->external_flags |= PCRE_HASCRORLF;
3265 if ((classbits[1] & 0x24) != 0) cd->external_flags |= PCRE_HASCRORLF;
3270 /* If class_charcount is 1, we saw precisely one character whose value is
3271 less than 256. As long as there were no characters >= 128 and there was no
3272 use of \p or \P, in other words, no use of any XCLASS features, we can
3275 In UTF-8 mode, we can optimize the negative case only if there were no
3276 characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR
3277 operate on single-bytes only. This is an historical hangover. Maybe one day
3278 we can tidy these opcodes to handle multi-byte characters.
3280 The optimization throws away the bit map. We turn the item into a
3281 1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
3282 that OP_NOT does not support multibyte characters. In the positive case, it
3283 can cause firstbyte to be set. Otherwise, there can be no first char if
3284 this item is first, whatever repeat count may follow. In the case of
3285 reqbyte, save the previous value for reinstating. */
3288 if (class_charcount == 1 && !class_utf8 &&
3289 (!utf8 || !negate_class || class_lastchar < 128))
3291 if (class_charcount == 1)
3294 zeroreqbyte = reqbyte;
3296 /* The OP_NOT opcode works on one-byte characters only. */
3300 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3301 zerofirstbyte = firstbyte;
3303 *code++ = class_lastchar;
3307 /* For a single, positive character, get the value into mcbuffer, and
3308 then we can handle this with the normal one-character code. */
3311 if (utf8 && class_lastchar > 127)
3312 mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
3316 mcbuffer[0] = class_lastchar;
3320 } /* End of 1-char optimization */
3322 /* The general case - not the one-char optimization. If this is the first
3323 thing in the branch, there can be no first char setting, whatever the
3324 repeat count. Any reqbyte setting must remain unchanged after any kind of
3327 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3328 zerofirstbyte = firstbyte;
3329 zeroreqbyte = reqbyte;
3331 /* If there are characters with values > 255, we have to compile an
3332 extended class, with its own opcode. If there are no characters < 256,
3333 we can omit the bitmap in the actual compiled code. */
3338 *class_utf8data++ = XCL_END; /* Marks the end of extra data */
3339 *code++ = OP_XCLASS;
3341 *code = negate_class? XCL_NOT : 0;
3343 /* If the map is required, move up the extra data to make room for it;
3344 otherwise just move the code pointer to the end of the extra data. */
3346 if (class_charcount > 0)
3349 memmove(code + 32, code, class_utf8data - code);
3350 memcpy(code, classbits, 32);
3351 code = class_utf8data + 32;
3353 else code = class_utf8data;
3355 /* Now fill in the complete length of the item */
3357 PUT(previous, 1, code - previous);
3358 break; /* End of class handling */
3362 /* If there are no characters > 255, negate the 32-byte map if necessary,
3363 and copy it into the code vector. If this is the first thing in the branch,
3364 there can be no first char setting, whatever the repeat count. Any reqbyte
3365 setting must remain unchanged after any kind of repeat. */
3369 *code++ = OP_NCLASS;
3370 if (lengthptr == NULL) /* Save time in the pre-compile phase */
3371 for (c = 0; c < 32; c++) code[c] = ~classbits[c];
3376 memcpy(code, classbits, 32);
3382 /* ===================================================================*/
3383 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
3384 has been tested above. */
3387 if (!is_quantifier) goto NORMAL_CHAR;
3388 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
3389 if (*errorcodeptr != 0) goto FAILED;
3407 if (previous == NULL)
3409 *errorcodeptr = ERR9;
3413 if (repeat_min == 0)
3415 firstbyte = zerofirstbyte; /* Adjust for zero repeat */
3416 reqbyte = zeroreqbyte; /* Ditto */
3419 /* Remember whether this is a variable length repeat */
3421 reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
3423 op_type = 0; /* Default single-char op codes */
3424 possessive_quantifier = FALSE; /* Default not possessive quantifier */
3426 /* Save start of previous item, in case we have to move it up to make space
3427 for an inserted OP_ONCE for the additional '+' extension. */
3429 tempcode = previous;
3431 /* If the next character is '+', we have a possessive quantifier. This
3432 implies greediness, whatever the setting of the PCRE_UNGREEDY option.
3433 If the next character is '?' this is a minimizing repeat, by default,
3434 but if PCRE_UNGREEDY is set, it works the other way round. We change the
3435 repeat type to the non-default. */
3439 repeat_type = 0; /* Force greedy */
3440 possessive_quantifier = TRUE;
3443 else if (ptr[1] == '?')
3445 repeat_type = greedy_non_default;
3448 else repeat_type = greedy_default;
3450 /* If previous was a character match, abolish the item and generate a
3451 repeat item instead. If a char item has a minimum of more than one, ensure
3452 that it is set in reqbyte - it might not be if a sequence such as x{3} is
3453 the first thing in a branch because the x will have gone into firstbyte
3456 if (*previous == OP_CHAR || *previous == OP_CHARNC)
3458 /* Deal with UTF-8 characters that take up more than one byte. It's
3459 easier to write this out separately than try to macrify it. Use c to
3460 hold the length of the character in bytes, plus 0x80 to flag that it's a
3461 length rather than a small character. */
3464 if (utf8 && (code[-1] & 0x80) != 0)
3466 uschar *lastchar = code - 1;
3467 while((*lastchar & 0xc0) == 0x80) lastchar--;
3468 c = code - lastchar; /* Length of UTF-8 character */
3469 memcpy(utf8_char, lastchar, c); /* Save the char */
3470 c |= 0x80; /* Flag c as a length */
3475 /* Handle the case of a single byte - either with no UTF8 support, or
3476 with UTF-8 disabled, or for a UTF-8 character < 128. */
3480 if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
3483 /* If the repetition is unlimited, it pays to see if the next thing on
3484 the line is something that cannot possibly match this character. If so,
3485 automatically possessifying this item gains some performance in the case
3486 where the match fails. */
3488 if (!possessive_quantifier &&
3490 check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3493 repeat_type = 0; /* Force greedy */
3494 possessive_quantifier = TRUE;
3497 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
3500 /* If previous was a single negated character ([^a] or similar), we use
3501 one of the special opcodes, replacing it. The code is shared with single-
3502 character repeats by setting op_type to add a suitable offset into
3503 repeat_type. We can also test for auto-possessification. OP_NOT is
3504 currently used only for single-byte chars. */
3506 else if (*previous == OP_NOT)
3508 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
3510 if (!possessive_quantifier &&
3512 check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3514 repeat_type = 0; /* Force greedy */
3515 possessive_quantifier = TRUE;
3517 goto OUTPUT_SINGLE_REPEAT;
3520 /* If previous was a character type match (\d or similar), abolish it and
3521 create a suitable repeat item. The code is shared with single-character
3522 repeats by setting op_type to add a suitable offset into repeat_type. Note
3523 that the Unicode property types will be present only when SUPPORT_UCP is
3524 defined, but we don't wrap the little bits of code here because it just
3525 makes it horribly messy. */
3527 else if (*previous < OP_EODN)
3530 int prop_type, prop_value;
3531 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
3534 if (!possessive_quantifier &&
3536 check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3538 repeat_type = 0; /* Force greedy */
3539 possessive_quantifier = TRUE;
3542 OUTPUT_SINGLE_REPEAT:
3543 if (*previous == OP_PROP || *previous == OP_NOTPROP)
3545 prop_type = previous[1];
3546 prop_value = previous[2];
3548 else prop_type = prop_value = -1;
3551 code = previous; /* Usually overwrite previous item */
3553 /* If the maximum is zero then the minimum must also be zero; Perl allows
3554 this case, so we do too - by simply omitting the item altogether. */
3556 if (repeat_max == 0) goto END_REPEAT;
3558 /* All real repeats make it impossible to handle partial matching (maybe
3559 one day we will be able to remove this restriction). */
3561 if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL;
3563 /* Combine the op_type with the repeat_type */
3565 repeat_type += op_type;
3567 /* A minimum of zero is handled either as the special case * or ?, or as
3568 an UPTO, with the maximum given. */
3570 if (repeat_min == 0)
3572 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
3573 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
3576 *code++ = OP_UPTO + repeat_type;
3577 PUT2INC(code, 0, repeat_max);
3581 /* A repeat minimum of 1 is optimized into some special cases. If the
3582 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3583 left in place and, if the maximum is greater than 1, we use OP_UPTO with
3584 one less than the maximum. */
3586 else if (repeat_min == 1)
3588 if (repeat_max == -1)
3589 *code++ = OP_PLUS + repeat_type;
3592 code = oldcode; /* leave previous item in place */
3593 if (repeat_max == 1) goto END_REPEAT;
3594 *code++ = OP_UPTO + repeat_type;
3595 PUT2INC(code, 0, repeat_max - 1);
3599 /* The case {n,n} is just an EXACT, while the general case {n,m} is
3600 handled as an EXACT followed by an UPTO. */
3604 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
3605 PUT2INC(code, 0, repeat_min);
3607 /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
3608 we have to insert the character for the previous code. For a repeated
3609 Unicode property match, there are two extra bytes that define the
3610 required property. In UTF-8 mode, long characters have their length in
3611 c, with the 0x80 bit as a flag. */
3616 if (utf8 && c >= 128)
3618 memcpy(code, utf8_char, c & 7);
3627 *code++ = prop_type;
3628 *code++ = prop_value;
3631 *code++ = OP_STAR + repeat_type;
3634 /* Else insert an UPTO if the max is greater than the min, again
3635 preceded by the character, for the previously inserted code. If the
3636 UPTO is just for 1 instance, we can use QUERY instead. */
3638 else if (repeat_max != repeat_min)
3641 if (utf8 && c >= 128)
3643 memcpy(code, utf8_char, c & 7);
3651 *code++ = prop_type;
3652 *code++ = prop_value;
3654 repeat_max -= repeat_min;
3656 if (repeat_max == 1)
3658 *code++ = OP_QUERY + repeat_type;
3662 *code++ = OP_UPTO + repeat_type;
3663 PUT2INC(code, 0, repeat_max);
3668 /* The character or character type itself comes last in all cases. */
3671 if (utf8 && c >= 128)
3673 memcpy(code, utf8_char, c & 7);
3680 /* For a repeated Unicode property match, there are two extra bytes that
3681 define the required property. */
3686 *code++ = prop_type;
3687 *code++ = prop_value;
3692 /* If previous was a character class or a back reference, we put the repeat
3693 stuff after it, but just skip the item if the repeat was {0,0}. */
3695 else if (*previous == OP_CLASS ||
3696 *previous == OP_NCLASS ||
3698 *previous == OP_XCLASS ||
3700 *previous == OP_REF)
3702 if (repeat_max == 0)
3708 /* All real repeats make it impossible to handle partial matching (maybe
3709 one day we will be able to remove this restriction). */
3711 if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL;
3713 if (repeat_min == 0 && repeat_max == -1)
3714 *code++ = OP_CRSTAR + repeat_type;
3715 else if (repeat_min == 1 && repeat_max == -1)
3716 *code++ = OP_CRPLUS + repeat_type;
3717 else if (repeat_min == 0 && repeat_max == 1)
3718 *code++ = OP_CRQUERY + repeat_type;
3721 *code++ = OP_CRRANGE + repeat_type;
3722 PUT2INC(code, 0, repeat_min);
3723 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
3724 PUT2INC(code, 0, repeat_max);
3728 /* If previous was a bracket group, we may have to replicate it in certain
3731 else if (*previous == OP_BRA || *previous == OP_CBRA ||
3732 *previous == OP_ONCE || *previous == OP_COND)
3736 int len = code - previous;
3737 uschar *bralink = NULL;
3739 /* Repeating a DEFINE group is pointless */
3741 if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
3743 *errorcodeptr = ERR55;
3747 /* If the maximum repeat count is unlimited, find the end of the bracket
3748 by scanning through from the start, and compute the offset back to it
3749 from the current code pointer. There may be an OP_OPT setting following
3750 the final KET, so we can't find the end just by going back from the code
3753 if (repeat_max == -1)
3755 register uschar *ket = previous;
3756 do ket += GET(ket, 1); while (*ket != OP_KET);
3757 ketoffset = code - ket;
3760 /* The case of a zero minimum is special because of the need to stick
3761 OP_BRAZERO in front of it, and because the group appears once in the
3762 data, whereas in other cases it appears the minimum number of times. For
3763 this reason, it is simplest to treat this case separately, as otherwise
3764 the code gets far too messy. There are several special subcases when the
3767 if (repeat_min == 0)
3769 /* If the maximum is also zero, we just omit the group from the output
3772 if (repeat_max == 0)
3778 /* If the maximum is 1 or unlimited, we just have to stick in the
3779 BRAZERO and do no more at this point. However, we do need to adjust
3780 any OP_RECURSE calls inside the group that refer to the group itself or
3781 any internal or forward referenced group, because the offset is from
3782 the start of the whole regex. Temporarily terminate the pattern while
3785 if (repeat_max <= 1)
3788 adjust_recurse(previous, 1, utf8, cd, save_hwm);
3789 memmove(previous+1, previous, len);
3791 *previous++ = OP_BRAZERO + repeat_type;
3794 /* If the maximum is greater than 1 and limited, we have to replicate
3795 in a nested fashion, sticking OP_BRAZERO before each set of brackets.
3796 The first one has to be handled carefully because it's the original
3797 copy, which has to be moved up. The remainder can be handled by code
3798 that is common with the non-zero minimum case below. We have to
3799 adjust the value of repeat_max, since one less copy is required. Once
3800 again, we may have to adjust any OP_RECURSE calls inside the group. */
3806 adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
3807 memmove(previous + 2 + LINK_SIZE, previous, len);
3808 code += 2 + LINK_SIZE;
3809 *previous++ = OP_BRAZERO + repeat_type;
3810 *previous++ = OP_BRA;
3812 /* We chain together the bracket offset fields that have to be
3813 filled in later when the ends of the brackets are reached. */
3815 offset = (bralink == NULL)? 0 : previous - bralink;
3817 PUTINC(previous, 0, offset);
3823 /* If the minimum is greater than zero, replicate the group as many
3824 times as necessary, and adjust the maximum to the number of subsequent
3825 copies that we need. If we set a first char from the group, and didn't
3826 set a required char, copy the latter from the former. If there are any
3827 forward reference subroutine calls in the group, there will be entries on
3828 the workspace list; replicate these with an appropriate increment. */
3834 /* In the pre-compile phase, we don't actually do the replication. We
3835 just adjust the length as if we had. Do some paranoid checks for
3836 potential integer overflow. */
3838 if (lengthptr != NULL)
3840 int delta = (repeat_min - 1)*length_prevgroup;
3841 if ((double)(repeat_min - 1)*(double)length_prevgroup >
3843 OFLOW_MAX - *lengthptr < delta)
3845 *errorcodeptr = ERR20;
3848 *lengthptr += delta;
3851 /* This is compiling for real */
3855 if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
3856 for (i = 1; i < repeat_min; i++)
3859 uschar *this_hwm = cd->hwm;
3860 memcpy(code, previous, len);
3861 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3863 PUT(cd->hwm, 0, GET(hc, 0) + len);
3864 cd->hwm += LINK_SIZE;
3866 save_hwm = this_hwm;
3872 if (repeat_max > 0) repeat_max -= repeat_min;
3875 /* This code is common to both the zero and non-zero minimum cases. If
3876 the maximum is limited, it replicates the group in a nested fashion,
3877 remembering the bracket starts on a stack. In the case of a zero minimum,
3878 the first one was set up above. In all cases the repeat_max now specifies
3879 the number of additional copies needed. Again, we must remember to
3880 replicate entries on the forward reference list. */
3882 if (repeat_max >= 0)
3884 /* In the pre-compile phase, we don't actually do the replication. We
3885 just adjust the length as if we had. For each repetition we must add 1
3886 to the length for BRAZERO and for all but the last repetition we must
3887 add 2 + 2*LINKSIZE to allow for the nesting that occurs. Do some
3888 paranoid checks to avoid integer overflow. */
3890 if (lengthptr != NULL && repeat_max > 0)
3892 int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
3893 2 - 2*LINK_SIZE; /* Last one doesn't nest */
3894 if ((double)repeat_max *
3895 (double)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
3896 > (double)INT_MAX ||
3897 OFLOW_MAX - *lengthptr < delta)
3899 *errorcodeptr = ERR20;
3902 *lengthptr += delta;
3905 /* This is compiling for real */
3907 else for (i = repeat_max - 1; i >= 0; i--)
3910 uschar *this_hwm = cd->hwm;
3912 *code++ = OP_BRAZERO + repeat_type;
3914 /* All but the final copy start a new nesting, maintaining the
3915 chain of brackets outstanding. */
3921 offset = (bralink == NULL)? 0 : code - bralink;
3923 PUTINC(code, 0, offset);
3926 memcpy(code, previous, len);
3927 for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
3929 PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
3930 cd->hwm += LINK_SIZE;
3932 save_hwm = this_hwm;
3936 /* Now chain through the pending brackets, and fill in their length
3937 fields (which are holding the chain links pro tem). */
3939 while (bralink != NULL)
3942 int offset = code - bralink + 1;
3943 uschar *bra = code - offset;
3944 oldlinkoffset = GET(bra, 1);
3945 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
3947 PUTINC(code, 0, offset);
3948 PUT(bra, 1, offset);
3952 /* If the maximum is unlimited, set a repeater in the final copy. We
3953 can't just offset backwards from the current code point, because we
3954 don't know if there's been an options resetting after the ket. The
3955 correct offset was computed above.
3957 Then, when we are doing the actual compile phase, check to see whether
3958 this group is a non-atomic one that could match an empty string. If so,
3959 convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
3960 that runtime checking can be done. [This check is also applied to
3961 atomic groups at runtime, but in a different way.] */
3965 uschar *ketcode = code - ketoffset;
3966 uschar *bracode = ketcode - GET(ketcode, 1);
3967 *ketcode = OP_KETRMAX + repeat_type;
3968 if (lengthptr == NULL && *bracode != OP_ONCE)
3970 uschar *scode = bracode;
3973 if (could_be_empty_branch(scode, ketcode, utf8))
3975 *bracode += OP_SBRA - OP_BRA;
3978 scode += GET(scode, 1);
3980 while (*scode == OP_ALT);
3985 /* Else there's some kind of shambles */
3989 *errorcodeptr = ERR11;
3993 /* If the character following a repeat is '+', or if certain optimization
3994 tests above succeeded, possessive_quantifier is TRUE. For some of the
3995 simpler opcodes, there is a special alternative opcode for this. For
3996 anything else, we wrap the entire repeated item inside OP_ONCE brackets.
3997 The '+' notation is just syntactic sugar, taken from Sun's Java package,
3998 but the special opcodes can optimize it a bit. The repeated item starts at
3999 tempcode, not at previous, which might be the first part of a string whose
4000 (former) last char we repeated.
4002 Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
4003 an 'upto' may follow. We skip over an 'exact' item, and then test the
4004 length of what remains before proceeding. */
4006 if (possessive_quantifier)
4009 if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||
4010 *tempcode == OP_NOTEXACT)
4011 tempcode += _pcre_OP_lengths[*tempcode];
4012 len = code - tempcode;
4013 if (len > 0) switch (*tempcode)
4015 case OP_STAR: *tempcode = OP_POSSTAR; break;
4016 case OP_PLUS: *tempcode = OP_POSPLUS; break;
4017 case OP_QUERY: *tempcode = OP_POSQUERY; break;
4018 case OP_UPTO: *tempcode = OP_POSUPTO; break;
4020 case OP_TYPESTAR: *tempcode = OP_TYPEPOSSTAR; break;
4021 case OP_TYPEPLUS: *tempcode = OP_TYPEPOSPLUS; break;
4022 case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
4023 case OP_TYPEUPTO: *tempcode = OP_TYPEPOSUPTO; break;
4025 case OP_NOTSTAR: *tempcode = OP_NOTPOSSTAR; break;
4026 case OP_NOTPLUS: *tempcode = OP_NOTPOSPLUS; break;
4027 case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
4028 case OP_NOTUPTO: *tempcode = OP_NOTPOSUPTO; break;
4031 memmove(tempcode + 1+LINK_SIZE, tempcode, len);
4032 code += 1 + LINK_SIZE;
4033 len += 1 + LINK_SIZE;
4034 tempcode[0] = OP_ONCE;
4036 PUTINC(code, 0, len);
4037 PUT(tempcode, 1, len);
4042 /* In all cases we no longer have a previous item. We also set the
4043 "follows varying string" flag for subsequently encountered reqbytes if
4044 it isn't already set and we have just passed a varying length item. */
4048 cd->req_varyopt |= reqvary;
4052 /* ===================================================================*/
4053 /* Start of nested parenthesized sub-expression, or comment or lookahead or
4054 lookbehind or option setting or condition or all the other extended
4055 parenthesis forms. */
4058 newoptions = options;
4062 reset_bracount = FALSE;
4064 /* First deal with various "verbs" that can be introduced by '*'. */
4066 if (*(++ptr) == '*' && (cd->ctypes[ptr[1]] & ctype_letter) != 0)
4069 const char *vn = verbnames;
4070 const uschar *name = ++ptr;
4072 while ((cd->ctypes[*++ptr] & ctype_letter) != 0);
4075 *errorcodeptr = ERR59; /* Not supported */
4080 *errorcodeptr = ERR60;
4083 namelen = ptr - name;
4084 for (i = 0; i < verbcount; i++)
4086 if (namelen == verbs[i].len &&
4087 strncmp((char *)name, vn, namelen) == 0)
4089 *code = verbs[i].op;
4090 if (*code++ == OP_ACCEPT) cd->had_accept = TRUE;
4093 vn += verbs[i].len + 1;
4095 if (i < verbcount) continue;
4096 *errorcodeptr = ERR60;
4100 /* Deal with the extended parentheses; all are introduced by '?', and the
4101 appearance of any of them means that this is not a capturing group. */
4103 else if (*ptr == '?')
4105 int i, set, unset, namelen;
4112 case '#': /* Comment; skip to ket */
4114 while (*ptr != 0 && *ptr != ')') ptr++;
4117 *errorcodeptr = ERR18;
4123 /* ------------------------------------------------------------ */
4124 case '|': /* Reset capture count for each branch */
4125 reset_bracount = TRUE;
4128 /* ------------------------------------------------------------ */
4129 case ':': /* Non-capturing bracket */
4135 /* ------------------------------------------------------------ */
4137 bravalue = OP_COND; /* Conditional group */
4139 /* A condition can be an assertion, a number (referring to a numbered
4140 group), a name (referring to a named group), or 'R', referring to
4141 recursion. R<digits> and R&name are also permitted for recursion tests.
4143 There are several syntaxes for testing a named group: (?(name)) is used
4144 by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
4146 There are two unfortunate ambiguities, caused by history. (a) 'R' can
4147 be the recursive thing or the name 'R' (and similarly for 'R' followed
4148 by digits), and (b) a number could be a name that consists of digits.
4149 In both cases, we look for a name first; if not found, we try the other
4152 /* For conditions that are assertions, check the syntax, and then exit
4153 the switch. This will take control down to where bracketed groups,
4154 including assertions, are processed. */
4156 if (ptr[1] == '?' && (ptr[2] == '=' || ptr[2] == '!' || ptr[2] == '<'))
4159 /* Most other conditions use OP_CREF (a couple change to OP_RREF
4160 below), and all need to skip 3 bytes at the start of the group. */
4162 code[1+LINK_SIZE] = OP_CREF;
4166 /* Check for a test for recursion in a named group. */
4168 if (ptr[1] == 'R' && ptr[2] == '&')
4172 code[1+LINK_SIZE] = OP_RREF; /* Change the type of test */
4175 /* Check for a test for a named group's having been set, using the Perl
4176 syntax (?(<name>) or (?('name') */
4178 else if (ptr[1] == '<')
4183 else if (ptr[1] == '\'')
4191 if (ptr[1] == '-' || ptr[1] == '+') refsign = *(++ptr);
4194 /* We now expect to read a name; anything else is an error */
4196 if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
4198 ptr += 1; /* To get the right offset */
4199 *errorcodeptr = ERR28;
4203 /* Read the name, but also get it as a number if it's all digits */
4207 while ((cd->ctypes[*ptr] & ctype_word) != 0)
4210 recno = ((digitab[*ptr] & ctype_digit) != 0)?
4211 recno * 10 + *ptr - '0' : -1;
4214 namelen = ptr - name;
4216 if ((terminator > 0 && *ptr++ != terminator) || *ptr++ != ')')
4218 ptr--; /* Error offset */
4219 *errorcodeptr = ERR26;
4223 /* Do no further checking in the pre-compile phase. */
4225 if (lengthptr != NULL) break;
4227 /* In the real compile we do the work of looking for the actual
4228 reference. If the string started with "+" or "-" we require the rest to
4229 be digits, in which case recno will be set. */
4235 *errorcodeptr = ERR58;
4240 recno = cd->bracount - recno + 1;
4243 *errorcodeptr = ERR15;
4247 else recno += cd->bracount;
4248 PUT2(code, 2+LINK_SIZE, recno);
4252 /* Otherwise (did not start with "+" or "-"), start by looking for the
4255 slot = cd->name_table;
4256 for (i = 0; i < cd->names_found; i++)
4258 if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
4259 slot += cd->name_entry_size;
4262 /* Found a previous named subpattern */
4264 if (i < cd->names_found)
4266 recno = GET2(slot, 0);
4267 PUT2(code, 2+LINK_SIZE, recno);
4270 /* Search the pattern for a forward reference */
4272 else if ((i = find_parens(ptr, cd->bracount, name, namelen,
4273 (options & PCRE_EXTENDED) != 0)) > 0)
4275 PUT2(code, 2+LINK_SIZE, i);
4278 /* If terminator == 0 it means that the name followed directly after
4279 the opening parenthesis [e.g. (?(abc)...] and in this case there are
4280 some further alternatives to try. For the cases where terminator != 0
4281 [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
4282 now checked all the possibilities, so give an error. */
4284 else if (terminator != 0)
4286 *errorcodeptr = ERR15;
4290 /* Check for (?(R) for recursion. Allow digits after R to specify a
4291 specific group number. */
4293 else if (*name == 'R')
4296 for (i = 1; i < namelen; i++)
4298 if ((digitab[name[i]] & ctype_digit) == 0)
4300 *errorcodeptr = ERR15;
4303 recno = recno * 10 + name[i] - '0';
4305 if (recno == 0) recno = RREF_ANY;
4306 code[1+LINK_SIZE] = OP_RREF; /* Change test type */
4307 PUT2(code, 2+LINK_SIZE, recno);
4310 /* Similarly, check for the (?(DEFINE) "condition", which is always
4313 else if (namelen == 6 && strncmp((char *)name, "DEFINE", 6) == 0)
4315 code[1+LINK_SIZE] = OP_DEF;
4319 /* Check for the "name" actually being a subpattern number. */
4323 PUT2(code, 2+LINK_SIZE, recno);
4326 /* Either an unidentified subpattern, or a reference to (?(0) */
4330 *errorcodeptr = (recno == 0)? ERR35: ERR15;
4336 /* ------------------------------------------------------------ */
4337 case '=': /* Positive lookahead */
4338 bravalue = OP_ASSERT;
4343 /* ------------------------------------------------------------ */
4344 case '!': /* Negative lookahead */
4346 if (*ptr == ')') /* Optimize (?!) */
4352 bravalue = OP_ASSERT_NOT;
4356 /* ------------------------------------------------------------ */
4357 case '<': /* Lookbehind or named define */
4360 case '=': /* Positive lookbehind */
4361 bravalue = OP_ASSERTBACK;
4365 case '!': /* Negative lookbehind */
4366 bravalue = OP_ASSERTBACK_NOT;
4370 default: /* Could be name define, else bad */
4371 if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;
4372 ptr++; /* Correct offset for error */
4373 *errorcodeptr = ERR24;
4379 /* ------------------------------------------------------------ */
4380 case '>': /* One-time brackets */
4386 /* ------------------------------------------------------------ */
4387 case 'C': /* Callout - may be followed by digits; */
4388 previous_callout = code; /* Save for later completion */
4389 after_manual_callout = 1; /* Skip one item before completing */
4390 *code++ = OP_CALLOUT;
4393 while ((digitab[*(++ptr)] & ctype_digit) != 0)
4394 n = n * 10 + *ptr - '0';
4397 *errorcodeptr = ERR39;
4402 *errorcodeptr = ERR38;
4406 PUT(code, 0, ptr - cd->start_pattern + 1); /* Pattern offset */
4407 PUT(code, LINK_SIZE, 0); /* Default length */
4408 code += 2 * LINK_SIZE;
4414 /* ------------------------------------------------------------ */
4415 case 'P': /* Python-style named subpattern handling */
4416 if (*(++ptr) == '=' || *ptr == '>') /* Reference or recursion */
4418 is_recurse = *ptr == '>';
4420 goto NAMED_REF_OR_RECURSE;
4422 else if (*ptr != '<') /* Test for Python-style definition */
4424 *errorcodeptr = ERR41;
4427 /* Fall through to handle (?P< as (?< is handled */
4430 /* ------------------------------------------------------------ */
4431 DEFINE_NAME: /* Come here from (?< handling */
4434 terminator = (*ptr == '<')? '>' : '\'';
4437 while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4438 namelen = ptr - name;
4440 /* In the pre-compile phase, just do a syntax check. */
4442 if (lengthptr != NULL)
4444 if (*ptr != terminator)
4446 *errorcodeptr = ERR42;
4449 if (cd->names_found >= MAX_NAME_COUNT)
4451 *errorcodeptr = ERR49;
4454 if (namelen + 3 > cd->name_entry_size)
4456 cd->name_entry_size = namelen + 3;
4457 if (namelen > MAX_NAME_SIZE)
4459 *errorcodeptr = ERR48;
4465 /* In the real compile, create the entry in the table */
4469 slot = cd->name_table;
4470 for (i = 0; i < cd->names_found; i++)
4472 int crc = memcmp(name, slot+2, namelen);
4475 if (slot[2+namelen] == 0)
4477 if ((options & PCRE_DUPNAMES) == 0)
4479 *errorcodeptr = ERR43;
4483 else crc = -1; /* Current name is substring */
4487 memmove(slot + cd->name_entry_size, slot,
4488 (cd->names_found - i) * cd->name_entry_size);
4491 slot += cd->name_entry_size;
4494 PUT2(slot, 0, cd->bracount + 1);
4495 memcpy(slot + 2, name, namelen);
4496 slot[2+namelen] = 0;
4500 /* In both cases, count the number of names we've encountered. */
4502 ptr++; /* Move past > or ' */
4504 goto NUMBERED_GROUP;
4507 /* ------------------------------------------------------------ */
4508 case '&': /* Perl recursion/subroutine syntax */
4513 /* We come here from the Python syntax above that handles both
4514 references (?P=name) and recursion (?P>name), as well as falling
4515 through from the Perl recursion syntax (?&name). */
4517 NAMED_REF_OR_RECURSE:
4519 while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4520 namelen = ptr - name;
4522 /* In the pre-compile phase, do a syntax check and set a dummy
4523 reference number. */
4525 if (lengthptr != NULL)
4527 if (*ptr != terminator)
4529 *errorcodeptr = ERR42;
4532 if (namelen > MAX_NAME_SIZE)
4534 *errorcodeptr = ERR48;
4540 /* In the real compile, seek the name in the table */
4544 slot = cd->name_table;
4545 for (i = 0; i < cd->names_found; i++)
4547 if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
4548 slot += cd->name_entry_size;
4551 if (i < cd->names_found) /* Back reference */
4553 recno = GET2(slot, 0);
4555 else if ((recno = /* Forward back reference */
4556 find_parens(ptr, cd->bracount, name, namelen,
4557 (options & PCRE_EXTENDED) != 0)) <= 0)
4559 *errorcodeptr = ERR15;
4564 /* In both phases, we can now go to the code that handles numerical
4565 recursion or backreferences. */
4567 if (is_recurse) goto HANDLE_RECURSION;
4568 else goto HANDLE_REFERENCE;
4571 /* ------------------------------------------------------------ */
4572 case 'R': /* Recursion */
4573 ptr++; /* Same as (?0) */
4577 /* ------------------------------------------------------------ */
4579 case '0': case '1': case '2': case '3': case '4': /* Recursion or */
4580 case '5': case '6': case '7': case '8': case '9': /* subroutine */
4582 const uschar *called;
4584 if ((refsign = *ptr) == '+') ptr++;
4585 else if (refsign == '-')
4587 if ((digitab[ptr[1]] & ctype_digit) == 0)
4588 goto OTHER_CHAR_AFTER_QUERY;
4593 while((digitab[*ptr] & ctype_digit) != 0)
4594 recno = recno * 10 + *ptr++ - '0';
4598 *errorcodeptr = ERR29;
4606 *errorcodeptr = ERR58;
4609 recno = cd->bracount - recno + 1;
4612 *errorcodeptr = ERR15;
4616 else if (refsign == '+')
4620 *errorcodeptr = ERR58;
4623 recno += cd->bracount;
4626 /* Come here from code above that handles a named recursion */
4631 called = cd->start_code;
4633 /* When we are actually compiling, find the bracket that is being
4634 referenced. Temporarily end the regex in case it doesn't exist before
4635 this point. If we end up with a forward reference, first check that
4636 the bracket does occur later so we can give the error (and position)
4637 now. Then remember this forward reference in the workspace so it can
4638 be filled in at the end. */
4640 if (lengthptr == NULL)
4643 if (recno != 0) called = find_bracket(cd->start_code, utf8, recno);
4645 /* Forward reference */
4649 if (find_parens(ptr, cd->bracount, NULL, recno,
4650 (options & PCRE_EXTENDED) != 0) < 0)
4652 *errorcodeptr = ERR15;
4655 called = cd->start_code + recno;
4656 PUTINC(cd->hwm, 0, code + 2 + LINK_SIZE - cd->start_code);
4659 /* If not a forward reference, and the subpattern is still open,
4660 this is a recursive call. We check to see if this is a left
4661 recursion that could loop for ever, and diagnose that case. */
4663 else if (GET(called, 1) == 0 &&
4664 could_be_empty(called, code, bcptr, utf8))
4666 *errorcodeptr = ERR40;
4671 /* Insert the recursion/subroutine item, automatically wrapped inside
4672 "once" brackets. Set up a "previous group" length so that a
4673 subsequent quantifier will work. */
4676 PUT(code, 1, 2 + 2*LINK_SIZE);
4677 code += 1 + LINK_SIZE;
4680 PUT(code, 1, called - cd->start_code);
4681 code += 1 + LINK_SIZE;
4684 PUT(code, 1, 2 + 2*LINK_SIZE);
4685 code += 1 + LINK_SIZE;
4687 length_prevgroup = 3 + 3*LINK_SIZE;
4690 /* Can't determine a first byte now */
4692 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
4696 /* ------------------------------------------------------------ */
4697 default: /* Other characters: check option setting */
4698 OTHER_CHAR_AFTER_QUERY:
4702 while (*ptr != ')' && *ptr != ':')
4706 case '-': optset = &unset; break;
4708 case 'J': /* Record that it changed in the external options */
4709 *optset |= PCRE_DUPNAMES;
4710 cd->external_flags |= PCRE_JCHANGED;
4713 case 'i': *optset |= PCRE_CASELESS; break;
4714 case 'm': *optset |= PCRE_MULTILINE; break;
4715 case 's': *optset |= PCRE_DOTALL; break;
4716 case 'x': *optset |= PCRE_EXTENDED; break;
4717 case 'U': *optset |= PCRE_UNGREEDY; break;
4718 case 'X': *optset |= PCRE_EXTRA; break;
4720 default: *errorcodeptr = ERR12;
4721 ptr--; /* Correct the offset */
4726 /* Set up the changed option bits, but don't change anything yet. */
4728 newoptions = (options | set) & (~unset);
4730 /* If the options ended with ')' this is not the start of a nested
4731 group with option changes, so the options change at this level. If this
4732 item is right at the start of the pattern, the options can be
4733 abstracted and made external in the pre-compile phase, and ignored in
4734 the compile phase. This can be helpful when matching -- for instance in
4735 caseless checking of required bytes.
4737 If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are
4738 definitely *not* at the start of the pattern because something has been
4739 compiled. In the pre-compile phase, however, the code pointer can have
4740 that value after the start, because it gets reset as code is discarded
4741 during the pre-compile. However, this can happen only at top level - if
4742 we are within parentheses, the starting BRA will still be present. At
4743 any parenthesis level, the length value can be used to test if anything
4744 has been compiled at that level. Thus, a test for both these conditions
4745 is necessary to ensure we correctly detect the start of the pattern in
4748 If we are not at the pattern start, compile code to change the ims
4749 options if this setting actually changes any of them. We also pass the
4750 new setting back so that it can be put at the start of any following
4751 branches, and when this group ends (if we are in a group), a resetting
4752 item can be compiled. */
4756 if (code == cd->start_code + 1 + LINK_SIZE &&
4757 (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE))
4759 cd->external_options = newoptions;
4760 options = newoptions;
4764 if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
4767 *code++ = newoptions & PCRE_IMS;
4770 /* Change options at this level, and pass them back for use
4771 in subsequent branches. Reset the greedy defaults and the case
4772 value for firstbyte and reqbyte. */
4774 *optionsptr = options = newoptions;
4775 greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
4776 greedy_non_default = greedy_default ^ 1;
4777 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
4780 previous = NULL; /* This item can't be repeated */
4781 continue; /* It is complete */
4784 /* If the options ended with ':' we are heading into a nested group
4785 with possible change of options. Such groups are non-capturing and are
4786 not assertions of any kind. All we need to do is skip over the ':';
4787 the newoptions value is handled below. */
4791 } /* End of switch for character following (? */
4792 } /* End of (? handling */
4794 /* Opening parenthesis not followed by '?'. If PCRE_NO_AUTO_CAPTURE is set,
4795 all unadorned brackets become non-capturing and behave like (?:...)
4798 else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
4803 /* Else we have a capturing group. */
4809 PUT2(code, 1+LINK_SIZE, cd->bracount);
4813 /* Process nested bracketed regex. Assertions may not be repeated, but
4814 other kinds can be. All their opcodes are >= OP_ONCE. We copy code into a
4815 non-register variable in order to be able to pass its address because some
4816 compilers complain otherwise. Pass in a new setting for the ims options if
4817 they have changed. */
4819 previous = (bravalue >= OP_ONCE)? code : NULL;
4822 tempreqvary = cd->req_varyopt; /* Save value before bracket */
4823 length_prevgroup = 0; /* Initialize for pre-compile phase */
4826 newoptions, /* The complete new option state */
4827 options & PCRE_IMS, /* The previous ims option state */
4828 &tempcode, /* Where to put code (updated) */
4829 &ptr, /* Input pointer (updated) */
4830 errorcodeptr, /* Where to put an error message */
4831 (bravalue == OP_ASSERTBACK ||
4832 bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
4833 reset_bracount, /* True if (?| group */
4834 skipbytes, /* Skip over bracket number */
4835 &subfirstbyte, /* For possible first char */
4836 &subreqbyte, /* For possible last char */
4837 bcptr, /* Current branch chain */
4838 cd, /* Tables block */
4839 (lengthptr == NULL)? NULL : /* Actual compile phase */
4840 &length_prevgroup /* Pre-compile phase */
4844 /* At the end of compiling, code is still pointing to the start of the
4845 group, while tempcode has been updated to point past the end of the group
4846 and any option resetting that may follow it. The pattern pointer (ptr)
4847 is on the bracket. */
4849 /* If this is a conditional bracket, check that there are no more than
4850 two branches in the group, or just one if it's a DEFINE group. We do this
4851 in the real compile phase, not in the pre-pass, where the whole group may
4852 not be available. */
4854 if (bravalue == OP_COND && lengthptr == NULL)
4863 while (*tc != OP_KET);
4865 /* A DEFINE group is never obeyed inline (the "condition" is always
4866 false). It must have only one branch. */
4868 if (code[LINK_SIZE+1] == OP_DEF)
4872 *errorcodeptr = ERR54;
4875 bravalue = OP_DEF; /* Just a flag to suppress char handling below */
4878 /* A "normal" conditional group. If there is just one branch, we must not
4879 make use of its firstbyte or reqbyte, because this is equivalent to an
4880 empty second branch. */
4886 *errorcodeptr = ERR27;
4889 if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
4893 /* Error if hit end of pattern */
4897 *errorcodeptr = ERR14;
4901 /* In the pre-compile phase, update the length by the length of the group,
4902 less the brackets at either end. Then reduce the compiled code to just a
4903 set of non-capturing brackets so that it doesn't use much memory if it is
4904 duplicated by a quantifier.*/
4906 if (lengthptr != NULL)
4908 if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
4910 *errorcodeptr = ERR20;
4913 *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
4915 PUTINC(code, 0, 1 + LINK_SIZE);
4917 PUTINC(code, 0, 1 + LINK_SIZE);
4918 break; /* No need to waste time with special character handling */
4921 /* Otherwise update the main code pointer to the end of the group. */
4925 /* For a DEFINE group, required and first character settings are not
4928 if (bravalue == OP_DEF) break;
4930 /* Handle updating of the required and first characters for other types of
4931 group. Update for normal brackets of all kinds, and conditions with two
4932 branches (see code above). If the bracket is followed by a quantifier with
4933 zero repeat, we have to back off. Hence the definition of zeroreqbyte and
4934 zerofirstbyte outside the main loop so that they can be accessed for the
4937 zeroreqbyte = reqbyte;
4938 zerofirstbyte = firstbyte;
4939 groupsetfirstbyte = FALSE;
4941 if (bravalue >= OP_ONCE)
4943 /* If we have not yet set a firstbyte in this branch, take it from the
4944 subpattern, remembering that it was set here so that a repeat of more
4945 than one can replicate it as reqbyte if necessary. If the subpattern has
4946 no firstbyte, set "none" for the whole branch. In both cases, a zero
4947 repeat forces firstbyte to "none". */
4949 if (firstbyte == REQ_UNSET)
4951 if (subfirstbyte >= 0)
4953 firstbyte = subfirstbyte;
4954 groupsetfirstbyte = TRUE;
4956 else firstbyte = REQ_NONE;
4957 zerofirstbyte = REQ_NONE;
4960 /* If firstbyte was previously set, convert the subpattern's firstbyte
4961 into reqbyte if there wasn't one, using the vary flag that was in
4962 existence beforehand. */
4964 else if (subfirstbyte >= 0 && subreqbyte < 0)
4965 subreqbyte = subfirstbyte | tempreqvary;
4967 /* If the subpattern set a required byte (or set a first byte that isn't
4968 really the first byte - see above), set it. */
4970 if (subreqbyte >= 0) reqbyte = subreqbyte;
4973 /* For a forward assertion, we take the reqbyte, if set. This can be
4974 helpful if the pattern that follows the assertion doesn't set a different
4975 char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
4976 for an assertion, however because it leads to incorrect effect for patterns
4977 such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
4978 of a firstbyte. This is overcome by a scan at the end if there's no
4979 firstbyte, looking for an asserted first char. */
4981 else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
4982 break; /* End of processing '(' */
4985 /* ===================================================================*/
4986 /* Handle metasequences introduced by \. For ones like \d, the ESC_ values
4987 are arranged to be the negation of the corresponding OP_values. For the
4988 back references, the values are ESC_REF plus the reference number. Only
4989 back references and those types that consume a character may be repeated.
4990 We can test for values between ESC_b and ESC_Z for the latter; this may
4991 have to change if any new ones are ever created. */
4995 c = check_escape(&ptr, errorcodeptr, cd->bracount, options, FALSE);
4996 if (*errorcodeptr != 0) goto FAILED;
5000 if (-c == ESC_Q) /* Handle start of quoted string */
5002 if (ptr[1] == '\\' && ptr[2] == 'E') ptr += 2; /* avoid empty string */
5007 if (-c == ESC_E) continue; /* Perl ignores an orphan \E */
5009 /* For metasequences that actually match a character, we disable the
5010 setting of a first character if it hasn't already been set. */
5012 if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
5013 firstbyte = REQ_NONE;
5015 /* Set values to reset to if this is followed by a zero repeat. */
5017 zerofirstbyte = firstbyte;
5018 zeroreqbyte = reqbyte;
5020 /* \k<name> or \k'name' is a back reference by name (Perl syntax).
5021 We also support \k{name} (.NET syntax) */
5023 if (-c == ESC_k && (ptr[1] == '<' || ptr[1] == '\'' || ptr[1] == '{'))
5026 terminator = (*(++ptr) == '<')? '>' : (*ptr == '\'')? '\'' : '}';
5027 goto NAMED_REF_OR_RECURSE;
5030 /* Back references are handled specially; must disable firstbyte if
5031 not set to cope with cases like (?=(\w+))\1: which would otherwise set
5036 recno = -c - ESC_REF;
5038 HANDLE_REFERENCE: /* Come here from named backref handling */
5039 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
5042 PUT2INC(code, 0, recno);
5043 cd->backref_map |= (recno < 32)? (1 << recno) : 1;
5044 if (recno > cd->top_backref) cd->top_backref = recno;
5047 /* So are Unicode property matches, if supported. */
5050 else if (-c == ESC_P || -c == ESC_p)
5054 int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
5055 if (ptype < 0) goto FAILED;
5057 *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
5063 /* If Unicode properties are not supported, \X, \P, and \p are not
5066 else if (-c == ESC_X || -c == ESC_P || -c == ESC_p)
5068 *errorcodeptr = ERR45;
5073 /* For the rest (including \X when Unicode properties are supported), we
5074 can obtain the OP value by negating the escape value. */
5078 previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
5084 /* We have a data character whose value is in c. In UTF-8 mode it may have
5085 a value > 127. We set its representation in the length/buffer, and then
5086 handle it as a data character. */
5089 if (utf8 && c > 127)
5090 mclength = _pcre_ord2utf8(c, mcbuffer);
5101 /* ===================================================================*/
5102 /* Handle a literal character. It is guaranteed not to be whitespace or #
5103 when the extended flag is set. If we are in UTF-8 mode, it may be a
5104 multi-byte literal character. */
5112 if (utf8 && c >= 0xc0)
5114 while ((ptr[1] & 0xc0) == 0x80)
5115 mcbuffer[mclength++] = *(++ptr);
5119 /* At this point we have the character's bytes in mcbuffer, and the length
5120 in mclength. When not in UTF-8 mode, the length is always 1. */
5124 *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARNC : OP_CHAR;
5125 for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
5127 /* Remember if \r or \n were seen */
5129 if (mcbuffer[0] == '\r' || mcbuffer[0] == '\n')
5130 cd->external_flags |= PCRE_HASCRORLF;
5132 /* Set the first and required bytes appropriately. If no previous first
5133 byte, set it from this character, but revert to none on a zero repeat.
5134 Otherwise, leave the firstbyte value alone, and don't change it on a zero
5137 if (firstbyte == REQ_UNSET)
5139 zerofirstbyte = REQ_NONE;
5140 zeroreqbyte = reqbyte;
5142 /* If the character is more than one byte long, we can set firstbyte
5143 only if it is not to be matched caselessly. */
5145 if (mclength == 1 || req_caseopt == 0)
5147 firstbyte = mcbuffer[0] | req_caseopt;
5148 if (mclength != 1) reqbyte = code[-1] | cd->req_varyopt;
5150 else firstbyte = reqbyte = REQ_NONE;
5153 /* firstbyte was previously set; we can set reqbyte only if the length is
5154 1 or the matching is caseful. */
5158 zerofirstbyte = firstbyte;
5159 zeroreqbyte = reqbyte;
5160 if (mclength == 1 || req_caseopt == 0)
5161 reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
5164 break; /* End of literal character handling */
5166 } /* end of big loop */
5169 /* Control never reaches here by falling through, only by a goto for all the
5170 error states. Pass back the position in the pattern so that it can be displayed
5171 to the user for diagnosing the error. */
5181 /*************************************************
5182 * Compile sequence of alternatives *
5183 *************************************************/
5185 /* On entry, ptr is pointing past the bracket character, but on return it
5186 points to the closing bracket, or vertical bar, or end of string. The code
5187 variable is pointing at the byte into which the BRA operator has been stored.
5188 If the ims options are changed at the start (for a (?ims: group) or during any
5189 branch, we need to insert an OP_OPT item at the start of every following branch
5190 to ensure they get set correctly at run time, and also pass the new options
5191 into every subsequent branch compile.
5193 This function is used during the pre-compile phase when we are trying to find
5194 out the amount of memory needed, as well as during the real compile phase. The
5195 value of lengthptr distinguishes the two phases.
5198 options option bits, including any changes for this subpattern
5199 oldims previous settings of ims option bits
5200 codeptr -> the address of the current code pointer
5201 ptrptr -> the address of the current pattern pointer
5202 errorcodeptr -> pointer to error code variable
5203 lookbehind TRUE if this is a lookbehind assertion
5204 reset_bracount TRUE to reset the count for each branch
5205 skipbytes skip this many bytes at start (for brackets and OP_COND)
5206 firstbyteptr place to put the first required character, or a negative number
5207 reqbyteptr place to put the last required character, or a negative number
5208 bcptr pointer to the chain of currently open branches
5209 cd points to the data block with tables pointers etc.
5210 lengthptr NULL during the real compile phase
5211 points to length accumulator during pre-compile phase
5213 Returns: TRUE on success
5217 compile_regex(int options, int oldims, uschar **codeptr, const uschar **ptrptr,
5218 int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, int skipbytes,
5219 int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd,
5222 const uschar *ptr = *ptrptr;
5223 uschar *code = *codeptr;
5224 uschar *last_branch = code;
5225 uschar *start_bracket = code;
5226 uschar *reverse_count = NULL;
5227 int firstbyte, reqbyte;
5228 int branchfirstbyte, branchreqbyte;
5237 firstbyte = reqbyte = REQ_UNSET;
5239 /* Accumulate the length for use in the pre-compile phase. Start with the
5240 length of the BRA and KET and any extra bytes that are required at the
5241 beginning. We accumulate in a local variable to save frequent testing of
5242 lengthptr for NULL. We cannot do this by looking at the value of code at the
5243 start and end of each alternative, because compiled items are discarded during
5244 the pre-compile phase so that the work space is not exceeded. */
5246 length = 2 + 2*LINK_SIZE + skipbytes;
5248 /* WARNING: If the above line is changed for any reason, you must also change
5249 the code that abstracts option settings at the start of the pattern and makes
5250 them global. It tests the value of length for (2 + 2*LINK_SIZE) in the
5251 pre-compile phase to find out whether anything has yet been compiled or not. */
5253 /* Offset is set zero to mark that this bracket is still open */
5256 code += 1 + LINK_SIZE + skipbytes;
5258 /* Loop for each alternative branch */
5260 orig_bracount = max_bracount = cd->bracount;
5263 /* For a (?| group, reset the capturing bracket count so that each branch
5264 uses the same numbers. */
5266 if (reset_bracount) cd->bracount = orig_bracount;
5268 /* Handle a change of ims options at the start of the branch */
5270 if ((options & PCRE_IMS) != oldims)
5273 *code++ = options & PCRE_IMS;
5277 /* Set up dummy OP_REVERSE if lookbehind assertion */
5281 *code++ = OP_REVERSE;
5282 reverse_count = code;
5284 length += 1 + LINK_SIZE;
5287 /* Now compile the branch; in the pre-compile phase its length gets added
5290 if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstbyte,
5291 &branchreqbyte, &bc, cd, (lengthptr == NULL)? NULL : &length))
5297 /* Keep the highest bracket count in case (?| was used and some branch
5298 has fewer than the rest. */
5300 if (cd->bracount > max_bracount) max_bracount = cd->bracount;
5302 /* In the real compile phase, there is some post-processing to be done. */
5304 if (lengthptr == NULL)
5306 /* If this is the first branch, the firstbyte and reqbyte values for the
5307 branch become the values for the regex. */
5309 if (*last_branch != OP_ALT)
5311 firstbyte = branchfirstbyte;
5312 reqbyte = branchreqbyte;
5315 /* If this is not the first branch, the first char and reqbyte have to
5316 match the values from all the previous branches, except that if the
5317 previous value for reqbyte didn't have REQ_VARY set, it can still match,
5318 and we set REQ_VARY for the regex. */
5322 /* If we previously had a firstbyte, but it doesn't match the new branch,
5323 we have to abandon the firstbyte for the regex, but if there was
5324 previously no reqbyte, it takes on the value of the old firstbyte. */
5326 if (firstbyte >= 0 && firstbyte != branchfirstbyte)
5328 if (reqbyte < 0) reqbyte = firstbyte;
5329 firstbyte = REQ_NONE;
5332 /* If we (now or from before) have no firstbyte, a firstbyte from the
5333 branch becomes a reqbyte if there isn't a branch reqbyte. */
5335 if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
5336 branchreqbyte = branchfirstbyte;
5338 /* Now ensure that the reqbytes match */
5340 if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
5342 else reqbyte |= branchreqbyte; /* To "or" REQ_VARY */
5345 /* If lookbehind, check that this branch matches a fixed-length string, and
5346 put the length into the OP_REVERSE item. Temporarily mark the end of the
5347 branch with OP_END. */
5353 fixed_length = find_fixedlength(last_branch, options);
5354 DPRINTF(("fixed length = %d\n", fixed_length));
5355 if (fixed_length < 0)
5357 *errorcodeptr = (fixed_length == -2)? ERR36 : ERR25;
5361 PUT(reverse_count, 0, fixed_length);
5365 /* Reached end of expression, either ')' or end of pattern. In the real
5366 compile phase, go back through the alternative branches and reverse the chain
5367 of offsets, with the field in the BRA item now becoming an offset to the
5368 first alternative. If there are no alternatives, it points to the end of the
5369 group. The length in the terminating ket is always the length of the whole
5370 bracketed item. If any of the ims options were changed inside the group,
5371 compile a resetting op-code following, except at the very end of the pattern.
5372 Return leaving the pointer at the terminating char. */
5376 if (lengthptr == NULL)
5378 int branch_length = code - last_branch;
5381 int prev_length = GET(last_branch, 1);
5382 PUT(last_branch, 1, branch_length);
5383 branch_length = prev_length;
5384 last_branch -= branch_length;
5386 while (branch_length > 0);
5389 /* Fill in the ket */
5392 PUT(code, 1, code - start_bracket);
5393 code += 1 + LINK_SIZE;
5395 /* Resetting option if needed */
5397 if ((options & PCRE_IMS) != oldims && *ptr == ')')
5404 /* Retain the highest bracket number, in case resetting was used. */
5406 cd->bracount = max_bracount;
5408 /* Set values to pass back */
5412 *firstbyteptr = firstbyte;
5413 *reqbyteptr = reqbyte;
5414 if (lengthptr != NULL)
5416 if (OFLOW_MAX - *lengthptr < length)
5418 *errorcodeptr = ERR20;
5421 *lengthptr += length;
5426 /* Another branch follows. In the pre-compile phase, we can move the code
5427 pointer back to where it was for the start of the first branch. (That is,
5428 pretend that each branch is the only one.)
5430 In the real compile phase, insert an ALT node. Its length field points back
5431 to the previous branch while the bracket remains open. At the end the chain
5432 is reversed. It's done like this so that the start of the bracket has a
5433 zero offset until it is closed, making it possible to detect recursion. */
5435 if (lengthptr != NULL)
5437 code = *codeptr + 1 + LINK_SIZE + skipbytes;
5438 length += 1 + LINK_SIZE;
5443 PUT(code, 1, code - last_branch);
5444 bc.current = last_branch = code;
5445 code += 1 + LINK_SIZE;
5450 /* Control never reaches here */
5456 /*************************************************
5457 * Check for anchored expression *
5458 *************************************************/
5460 /* Try to find out if this is an anchored regular expression. Consider each
5461 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
5462 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
5463 it's anchored. However, if this is a multiline pattern, then only OP_SOD
5464 counts, since OP_CIRC can match in the middle.
5466 We can also consider a regex to be anchored if OP_SOM starts all its branches.
5467 This is the code for \G, which means "match at start of match position, taking
5468 into account the match offset".
5470 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
5471 because that will try the rest of the pattern at all possible matching points,
5472 so there is no point trying again.... er ....
5474 .... except when the .* appears inside capturing parentheses, and there is a
5475 subsequent back reference to those parentheses. We haven't enough information
5476 to catch that case precisely.
5478 At first, the best we could do was to detect when .* was in capturing brackets
5479 and the highest back reference was greater than or equal to that level.
5480 However, by keeping a bitmap of the first 31 back references, we can catch some
5481 of the more common cases more precisely.
5484 code points to start of expression (the bracket)
5485 options points to the options setting
5486 bracket_map a bitmap of which brackets we are inside while testing; this
5487 handles up to substring 31; after that we just have to take
5488 the less precise approach
5489 backref_map the back reference bitmap
5491 Returns: TRUE or FALSE
5495 is_anchored(register const uschar *code, int *options, unsigned int bracket_map,
5496 unsigned int backref_map)
5499 const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
5500 options, PCRE_MULTILINE, FALSE);
5501 register int op = *scode;
5503 /* Non-capturing brackets */
5507 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
5510 /* Capturing brackets */
5512 else if (op == OP_CBRA)
5514 int n = GET2(scode, 1+LINK_SIZE);
5515 int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
5516 if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
5519 /* Other brackets */
5521 else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
5523 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
5526 /* .* is not anchored unless DOTALL is set and it isn't in brackets that
5527 are or may be referenced. */
5529 else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
5530 op == OP_TYPEPOSSTAR) &&
5531 (*options & PCRE_DOTALL) != 0)
5533 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
5536 /* Check for explicit anchoring */
5538 else if (op != OP_SOD && op != OP_SOM &&
5539 ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
5541 code += GET(code, 1);
5543 while (*code == OP_ALT); /* Loop for each alternative */
5549 /*************************************************
5550 * Check for starting with ^ or .* *
5551 *************************************************/
5553 /* This is called to find out if every branch starts with ^ or .* so that
5554 "first char" processing can be done to speed things up in multiline
5555 matching and for non-DOTALL patterns that start with .* (which must start at
5556 the beginning or after \n). As in the case of is_anchored() (see above), we
5557 have to take account of back references to capturing brackets that contain .*
5558 because in that case we can't make the assumption.
5561 code points to start of expression (the bracket)
5562 bracket_map a bitmap of which brackets we are inside while testing; this
5563 handles up to substring 31; after that we just have to take
5564 the less precise approach
5565 backref_map the back reference bitmap
5567 Returns: TRUE or FALSE
5571 is_startline(const uschar *code, unsigned int bracket_map,
5572 unsigned int backref_map)
5575 const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
5577 register int op = *scode;
5579 /* Non-capturing brackets */
5583 if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
5586 /* Capturing brackets */
5588 else if (op == OP_CBRA)
5590 int n = GET2(scode, 1+LINK_SIZE);
5591 int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
5592 if (!is_startline(scode, new_map, backref_map)) return FALSE;
5595 /* Other brackets */
5597 else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
5598 { if (!is_startline(scode, bracket_map, backref_map)) return FALSE; }
5600 /* .* means "start at start or after \n" if it isn't in brackets that
5601 may be referenced. */
5603 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
5605 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
5608 /* Check for explicit circumflex */
5610 else if (op != OP_CIRC) return FALSE;
5612 /* Move on to the next alternative */
5614 code += GET(code, 1);
5616 while (*code == OP_ALT); /* Loop for each alternative */
5622 /*************************************************
5623 * Check for asserted fixed first char *
5624 *************************************************/
5626 /* During compilation, the "first char" settings from forward assertions are
5627 discarded, because they can cause conflicts with actual literals that follow.
5628 However, if we end up without a first char setting for an unanchored pattern,
5629 it is worth scanning the regex to see if there is an initial asserted first
5630 char. If all branches start with the same asserted char, or with a bracket all
5631 of whose alternatives start with the same asserted char (recurse ad lib), then
5632 we return that char, otherwise -1.
5635 code points to start of expression (the bracket)
5636 options pointer to the options (used to check casing changes)
5637 inassert TRUE if in an assertion
5639 Returns: -1 or the fixed first char
5643 find_firstassertedchar(const uschar *code, int *options, BOOL inassert)
5645 register int c = -1;
5648 const uschar *scode =
5649 first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);
5650 register int op = *scode;
5662 if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)
5664 if (c < 0) c = d; else if (c != d) return -1;
5667 case OP_EXACT: /* Fall through */
5675 if (!inassert) return -1;
5679 if ((*options & PCRE_CASELESS) != 0) c |= REQ_CASELESS;
5681 else if (c != scode[1]) return -1;
5685 code += GET(code, 1);
5687 while (*code == OP_ALT);
5693 /*************************************************
5694 * Compile a Regular Expression *
5695 *************************************************/
5697 /* This function takes a string and returns a pointer to a block of store
5698 holding a compiled version of the expression. The original API for this
5699 function had no error code return variable; it is retained for backwards
5700 compatibility. The new function is given a new name.
5703 pattern the regular expression
5704 options various option bits
5705 errorcodeptr pointer to error code variable (pcre_compile2() only)
5706 can be NULL if you don't want a code value
5707 errorptr pointer to pointer to error text
5708 erroroffset ptr offset in pattern where error was detected
5709 tables pointer to character tables or NULL
5711 Returns: pointer to compiled data block, or NULL on error,
5712 with errorptr and erroroffset set
5715 PCRE_EXP_DEFN pcre *
5716 pcre_compile(const char *pattern, int options, const char **errorptr,
5717 int *erroroffset, const unsigned char *tables)
5719 return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
5723 PCRE_EXP_DEFN pcre *
5724 pcre_compile2(const char *pattern, int options, int *errorcodeptr,
5725 const char **errorptr, int *erroroffset, const unsigned char *tables)
5728 int length = 1; /* For final END opcode */
5729 int firstbyte, reqbyte, newline;
5731 int skipatstart = 0;
5737 const uschar *codestart;
5739 compile_data compile_block;
5740 compile_data *cd = &compile_block;
5742 /* This space is used for "compiling" into during the first phase, when we are
5743 computing the amount of memory that is needed. Compiled items are thrown away
5744 as soon as possible, so that a fairly large buffer should be sufficient for
5745 this purpose. The same space is used in the second phase for remembering where
5746 to fill in forward references to subpatterns. */
5748 uschar cworkspace[COMPILE_WORK_SIZE];
5751 /* Set this early so that early errors get offset 0. */
5753 ptr = (const uschar *)pattern;
5755 /* We can't pass back an error message if errorptr is NULL; I guess the best we
5756 can do is just return NULL, but we can set a code value if there is a code
5759 if (errorptr == NULL)
5761 if (errorcodeptr != NULL) *errorcodeptr = 99;
5766 if (errorcodeptr != NULL) *errorcodeptr = ERR0;
5768 /* However, we can give a message for this error */
5770 if (erroroffset == NULL)
5773 goto PCRE_EARLY_ERROR_RETURN2;
5778 /* Can't support UTF8 unless PCRE has been compiled to include the code. */
5781 utf8 = (options & PCRE_UTF8) != 0;
5782 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
5783 (*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0)
5786 goto PCRE_EARLY_ERROR_RETURN2;
5789 if ((options & PCRE_UTF8) != 0)
5792 goto PCRE_EARLY_ERROR_RETURN;
5796 if ((options & ~PUBLIC_OPTIONS) != 0)
5799 goto PCRE_EARLY_ERROR_RETURN;
5802 /* Set up pointers to the individual character tables */
5804 if (tables == NULL) tables = _pcre_default_tables;
5805 cd->lcc = tables + lcc_offset;
5806 cd->fcc = tables + fcc_offset;
5807 cd->cbits = tables + cbits_offset;
5808 cd->ctypes = tables + ctypes_offset;
5810 /* Check for global one-time settings at the start of the pattern, and remember
5811 the offset for later. */
5813 while (ptr[skipatstart] == '(' && ptr[skipatstart+1] == '*')
5818 if (strncmp((char *)(ptr+skipatstart+2), "CR)", 3) == 0)
5819 { skipatstart += 5; newnl = PCRE_NEWLINE_CR; }
5820 else if (strncmp((char *)(ptr+skipatstart+2), "LF)", 3) == 0)
5821 { skipatstart += 5; newnl = PCRE_NEWLINE_LF; }
5822 else if (strncmp((char *)(ptr+skipatstart+2), "CRLF)", 5) == 0)
5823 { skipatstart += 7; newnl = PCRE_NEWLINE_CR + PCRE_NEWLINE_LF; }
5824 else if (strncmp((char *)(ptr+skipatstart+2), "ANY)", 4) == 0)
5825 { skipatstart += 6; newnl = PCRE_NEWLINE_ANY; }
5826 else if (strncmp((char *)(ptr+skipatstart+2), "ANYCRLF)", 8) == 0)
5827 { skipatstart += 10; newnl = PCRE_NEWLINE_ANYCRLF; }
5829 else if (strncmp((char *)(ptr+skipatstart+2), "BSR_ANYCRLF)", 12) == 0)
5830 { skipatstart += 14; newbsr = PCRE_BSR_ANYCRLF; }
5831 else if (strncmp((char *)(ptr+skipatstart+2), "BSR_UNICODE)", 12) == 0)
5832 { skipatstart += 14; newbsr = PCRE_BSR_UNICODE; }
5835 options = (options & ~PCRE_NEWLINE_BITS) | newnl;
5836 else if (newbsr != 0)
5837 options = (options & ~(PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) | newbsr;
5841 /* Check validity of \R options. */
5843 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
5846 case PCRE_BSR_ANYCRLF:
5847 case PCRE_BSR_UNICODE:
5849 default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
5852 /* Handle different types of newline. The three bits give seven cases. The
5853 current code allows for fixed one- or two-byte sequences, plus "any" and
5856 switch (options & PCRE_NEWLINE_BITS)
5858 case 0: newline = NEWLINE; break; /* Build-time default */
5859 case PCRE_NEWLINE_CR: newline = '\r'; break;
5860 case PCRE_NEWLINE_LF: newline = '\n'; break;
5861 case PCRE_NEWLINE_CR+
5862 PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
5863 case PCRE_NEWLINE_ANY: newline = -1; break;
5864 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
5865 default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
5870 cd->nltype = NLTYPE_ANYCRLF;
5872 else if (newline < 0)
5874 cd->nltype = NLTYPE_ANY;
5878 cd->nltype = NLTYPE_FIXED;
5882 cd->nl[0] = (newline >> 8) & 255;
5883 cd->nl[1] = newline & 255;
5888 cd->nl[0] = newline;
5892 /* Maximum back reference and backref bitmap. The bitmap records up to 31 back
5893 references to help in deciding whether (.*) can be treated as anchored or not.
5896 cd->top_backref = 0;
5897 cd->backref_map = 0;
5899 /* Reflect pattern for debugging output */
5901 DPRINTF(("------------------------------------------------------------------\n"));
5902 DPRINTF(("%s\n", pattern));
5904 /* Pretend to compile the pattern while actually just accumulating the length
5905 of memory required. This behaviour is triggered by passing a non-NULL final
5906 argument to compile_regex(). We pass a block of workspace (cworkspace) for it
5907 to compile parts of the pattern into; the compiled code is discarded when it is
5908 no longer needed, so hopefully this workspace will never overflow, though there
5909 is a test for its doing so. */
5912 cd->names_found = 0;
5913 cd->name_entry_size = 0;
5914 cd->name_table = NULL;
5915 cd->start_workspace = cworkspace;
5916 cd->start_code = cworkspace;
5917 cd->hwm = cworkspace;
5918 cd->start_pattern = (const uschar *)pattern;
5919 cd->end_pattern = (const uschar *)(pattern + strlen(pattern));
5920 cd->req_varyopt = 0;
5921 cd->external_options = options;
5922 cd->external_flags = 0;
5924 /* Now do the pre-compile. On error, errorcode will be set non-zero, so we
5925 don't need to look at the result of the function here. The initial options have
5926 been put into the cd block so that they can be changed if an option setting is
5927 found within the regex right at the beginning. Bringing initial option settings
5928 outside can help speed up starting point checks. */
5933 (void)compile_regex(cd->external_options, cd->external_options & PCRE_IMS,
5934 &code, &ptr, &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd,
5936 if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN;
5938 DPRINTF(("end pre-compile: length=%d workspace=%d\n", length,
5939 cd->hwm - cworkspace));
5941 if (length > MAX_PATTERN_SIZE)
5944 goto PCRE_EARLY_ERROR_RETURN;
5947 /* Compute the size of data block needed and get it, either from malloc or
5948 externally provided function. Integer overflow should no longer be possible
5949 because nowadays we limit the maximum value of cd->names_found and
5950 cd->name_entry_size. */
5952 size = length + sizeof(real_pcre) + cd->names_found * (cd->name_entry_size + 3);
5953 re = (real_pcre *)(pcre_malloc)(size);
5958 goto PCRE_EARLY_ERROR_RETURN;
5961 /* Put in the magic number, and save the sizes, initial options, internal
5962 flags, and character table pointer. NULL is used for the default character
5963 tables. The nullpad field is at the end; it's there to help in the case when a
5964 regex compiled on a system with 4-byte pointers is run on another with 8-byte
5967 re->magic_number = MAGIC_NUMBER;
5969 re->options = cd->external_options;
5970 re->flags = cd->external_flags;
5974 re->name_table_offset = sizeof(real_pcre);
5975 re->name_entry_size = cd->name_entry_size;
5976 re->name_count = cd->names_found;
5978 re->tables = (tables == _pcre_default_tables)? NULL : tables;
5981 /* The starting points of the name/number translation table and of the code are
5982 passed around in the compile data block. The start/end pattern and initial
5983 options are already set from the pre-compile phase, as is the name_entry_size
5984 field. Reset the bracket count and the names_found field. Also reset the hwm
5985 field; this time it's used for remembering forward references to subpatterns.
5989 cd->names_found = 0;
5990 cd->name_table = (uschar *)re + re->name_table_offset;
5991 codestart = cd->name_table + re->name_entry_size * re->name_count;
5992 cd->start_code = codestart;
5993 cd->hwm = cworkspace;
5994 cd->req_varyopt = 0;
5995 cd->had_accept = FALSE;
5997 /* Set up a starting, non-extracting bracket, then compile the expression. On
5998 error, errorcode will be set non-zero, so we don't need to look at the result
5999 of the function here. */
6001 ptr = (const uschar *)pattern + skipatstart;
6002 code = (uschar *)codestart;
6004 (void)compile_regex(re->options, re->options & PCRE_IMS, &code, &ptr,
6005 &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, NULL);
6006 re->top_bracket = cd->bracount;
6007 re->top_backref = cd->top_backref;
6008 re->flags = cd->external_flags;
6010 if (cd->had_accept) reqbyte = -1; /* Must disable after (*ACCEPT) */
6012 /* If not reached end of pattern on success, there's an excess bracket. */
6014 if (errorcode == 0 && *ptr != 0) errorcode = ERR22;
6016 /* Fill in the terminating state and check for disastrous overflow, but
6017 if debugging, leave the test till after things are printed out. */
6022 if (code - codestart > length) errorcode = ERR23;
6025 /* Fill in any forward references that are required. */
6027 while (errorcode == 0 && cd->hwm > cworkspace)
6030 const uschar *groupptr;
6031 cd->hwm -= LINK_SIZE;
6032 offset = GET(cd->hwm, 0);
6033 recno = GET(codestart, offset);
6034 groupptr = find_bracket(codestart, (re->options & PCRE_UTF8) != 0, recno);
6035 if (groupptr == NULL) errorcode = ERR53;
6036 else PUT(((uschar *)codestart), offset, groupptr - codestart);
6039 /* Give an error if there's a back reference to a non-existent capturing
6042 if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15;
6044 /* Failed to compile, or error while post-processing */
6049 PCRE_EARLY_ERROR_RETURN:
6050 *erroroffset = ptr - (const uschar *)pattern;
6051 PCRE_EARLY_ERROR_RETURN2:
6052 *errorptr = find_error_text(errorcode);
6053 if (errorcodeptr != NULL) *errorcodeptr = errorcode;
6057 /* If the anchored option was not passed, set the flag if we can determine that
6058 the pattern is anchored by virtue of ^ characters or \A or anything else (such
6059 as starting with .* when DOTALL is set).
6061 Otherwise, if we know what the first byte has to be, save it, because that
6062 speeds up unanchored matches no end. If not, see if we can set the
6063 PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
6064 start with ^, and also when all branches start with .* for non-DOTALL matches.
6067 if ((re->options & PCRE_ANCHORED) == 0)
6069 int temp_options = re->options; /* May get changed during these scans */
6070 if (is_anchored(codestart, &temp_options, 0, cd->backref_map))
6071 re->options |= PCRE_ANCHORED;
6075 firstbyte = find_firstassertedchar(codestart, &temp_options, FALSE);
6076 if (firstbyte >= 0) /* Remove caseless flag for non-caseable chars */
6078 int ch = firstbyte & 255;
6079 re->first_byte = ((firstbyte & REQ_CASELESS) != 0 &&
6080 cd->fcc[ch] == ch)? ch : firstbyte;
6081 re->flags |= PCRE_FIRSTSET;
6083 else if (is_startline(codestart, 0, cd->backref_map))
6084 re->flags |= PCRE_STARTLINE;
6088 /* For an anchored pattern, we use the "required byte" only if it follows a
6089 variable length item in the regex. Remove the caseless flag for non-caseable
6093 ((re->options & PCRE_ANCHORED) == 0 || (reqbyte & REQ_VARY) != 0))
6095 int ch = reqbyte & 255;
6096 re->req_byte = ((reqbyte & REQ_CASELESS) != 0 &&
6097 cd->fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte;
6098 re->flags |= PCRE_REQCHSET;
6101 /* Print out the compiled data if debugging is enabled. This is never the
6102 case when building a production library. */
6106 printf("Length = %d top_bracket = %d top_backref = %d\n",
6107 length, re->top_bracket, re->top_backref);
6109 printf("Options=%08x\n", re->options);
6111 if ((re->flags & PCRE_FIRSTSET) != 0)
6113 int ch = re->first_byte & 255;
6114 const char *caseless = ((re->first_byte & REQ_CASELESS) == 0)?
6116 if (isprint(ch)) printf("First char = %c%s\n", ch, caseless);
6117 else printf("First char = \\x%02x%s\n", ch, caseless);
6120 if ((re->flags & PCRE_REQCHSET) != 0)
6122 int ch = re->req_byte & 255;
6123 const char *caseless = ((re->req_byte & REQ_CASELESS) == 0)?
6125 if (isprint(ch)) printf("Req char = %c%s\n", ch, caseless);
6126 else printf("Req char = \\x%02x%s\n", ch, caseless);
6129 pcre_printint(re, stdout, TRUE);
6131 /* This check is done here in the debugging case so that the code that
6132 was compiled can be seen. */
6134 if (code - codestart > length)
6137 *errorptr = find_error_text(ERR23);
6138 *erroroffset = ptr - (uschar *)pattern;
6139 if (errorcodeptr != NULL) *errorcodeptr = ERR23;
6147 /* End of pcre_compile.c */