1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
6 This is a library of functions to support regular expressions whose syntax
7 and semantics are as close as possible to those of the Perl 5 language. See
8 the file Tech.Notes for some information on the internals.
10 Written by: Philip Hazel <ph10@cam.ac.uk>
12 Copyright (c) 1997-2004 University of Cambridge
14 -----------------------------------------------------------------------------
15 Redistribution and use in source and binary forms, with or without
16 modification, are permitted provided that the following conditions are met:
18 * Redistributions of source code must retain the above copyright notice,
19 this list of conditions and the following disclaimer.
21 * Redistributions in binary form must reproduce the above copyright
22 notice, this list of conditions and the following disclaimer in the
23 documentation and/or other materials provided with the distribution.
25 * Neither the name of the University of Cambridge nor the names of its
26 contributors may be used to endorse or promote products derived from
27 this software without specific prior written permission.
29 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
30 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
31 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
32 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
33 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
34 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
35 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
36 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
37 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
38 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
39 POSSIBILITY OF SUCH DAMAGE.
40 -----------------------------------------------------------------------------
44 /* Define DEBUG to get debugging output on stdout. */
47 /* Use a macro for debugging printing, 'cause that eliminates the use of #ifdef
48 inline, and there are *still* stupid compilers about that don't like indented
49 pre-processor statements. I suppose it's only been 10 years... */
52 #define DPRINTF(p) printf p
54 #define DPRINTF(p) /*nothing*/
57 /* Include the internals header, which itself includes "config.h", the Standard
58 C headers, and the external pcre header. */
62 /* If Unicode Property support is wanted, include a private copy of the
63 function that does it, and the table that translates names to numbers. */
67 #include "ucptypetable.c"
70 /* Maximum number of items on the nested bracket stacks at compile time. This
71 applies to the nesting of all kinds of parentheses. It does not limit
72 un-nested, non-capturing parentheses. This number can be made bigger if
73 necessary - it is used to dimension one int and one unsigned char vector at compile time. */
76 #define BRASTACK_SIZE 200
79 /* Maximum number of ints of offset to save on the stack for recursive calls.
80 If the offset vector is bigger, malloc is used. This should be a multiple of 3,
81 because the offset vector is always a multiple of 3 long. */
83 #define REC_STACK_SAVE_MAX 30
86 /* The maximum remaining length of subject we are prepared to search for a req_byte match. */
89 #define REQ_BYTE_MAX 1000
92 /* Table of sizes for the fixed-length opcodes. It's defined in a macro so that
93 the definition is next to the definition of the opcodes in internal.h. */
95 static const uschar OP_lengths[] = { OP_LENGTHS };
97 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
99 static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
100 static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
102 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
103 are simple data values; negative values are for special things like \d and so
104 on. Zero means further processing is needed (for things like \x), or the escape is invalid. */
107 #if !EBCDIC /* This is the "normal" table for ASCII systems */
108 static const short int escapes[] = {
109 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
110 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
111 '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */
112 0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
113 -ESC_P, -ESC_Q, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
114 -ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
115 '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */
116 0, 0, 0, 0, 0, 0, ESC_n, 0, /* h - o */
117 -ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, 0, -ESC_w, /* p - w */
118 0, 0, -ESC_z /* x - z */
121 #else /* This is the "abnormal" table for EBCDIC systems */
122 static const short int escapes[] = {
123 /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
124 /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
125 /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
126 /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
127 /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
128 /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
129 /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
130 /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
131 /* 88 */ 0, 0, 0, '{', 0, 0, 0, 0,
132 /* 90 */ 0, 0, 0, 'l', 0, ESC_n, 0, -ESC_p,
133 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
134 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0, 0, -ESC_w, 0,
135 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
136 /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
137 /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
138 /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
139 /* C8 */ 0, 0, 0, 0, 0, 0, 0, 0,
140 /* D0 */ '}', 0, 0, 0, 0, 0, 0, -ESC_P,
141 /* D8 */-ESC_Q, 0, 0, 0, 0, 0, 0, 0,
142 /* E0 */ '\\', 0, -ESC_S, 0, 0, 0, -ESC_W, -ESC_X,
143 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
144 /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
145 /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
150 /* Tables of names of POSIX character classes and their lengths. The list is
151 terminated by a zero length entry. The first three must be alpha, lower, upper,
152 as this is assumed for handling case independence. */
154 static const char *const posix_names[] = {
155 "alpha", "lower", "upper",
156 "alnum", "ascii", "blank", "cntrl", "digit", "graph",
157 "print", "punct", "space", "word", "xdigit" };
159 static const uschar posix_name_lengths[] = {
160 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
162 /* Table of class bit maps for each POSIX class; up to three may be combined
163 to form the class. The table for [:blank:] is dynamically modified to remove
164 the vertical space characters. */
166 static const int posix_class_maps[] = {
167 cbit_lower, cbit_upper, -1, /* alpha */
168 cbit_lower, -1, -1, /* lower */
169 cbit_upper, -1, -1, /* upper */
170 cbit_digit, cbit_lower, cbit_upper, /* alnum */
171 cbit_print, cbit_cntrl, -1, /* ascii */
172 cbit_space, -1, -1, /* blank - a GNU extension */
173 cbit_cntrl, -1, -1, /* cntrl */
174 cbit_digit, -1, -1, /* digit */
175 cbit_graph, -1, -1, /* graph */
176 cbit_print, -1, -1, /* print */
177 cbit_punct, -1, -1, /* punct */
178 cbit_space, -1, -1, /* space */
179 cbit_word, -1, -1, /* word - a Perl extension */
180 cbit_xdigit,-1, -1 /* xdigit */
183 /* Table to identify digits and hex digits. This is used when compiling
184 patterns. Note that the tables in chartables are dependent on the locale, and
185 may mark arbitrary characters as digits - but the PCRE compiling code expects
186 to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
187 a private table here. It costs 256 bytes, but it is a lot faster than doing
188 character value tests (at least in some simple cases I timed), and in some
189 applications one wants PCRE to compile efficiently as well as match efficiently.
192 For convenience, we use the same bit definitions as in chartables:
195 0x04 decimal digit, 0x08 hexadecimal digit
197 Then we can use ctype_digit and ctype_xdigit in the code. */
199 #if !EBCDIC /* This is the "normal" case, for ASCII systems */
200 static const unsigned char digitab[] =
202 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
203 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
204 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
205 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
206 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */
207 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */
208 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */
209 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */
210 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */
211 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */
212 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */
213 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */
214 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */
215 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */
216 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */
217 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */
218 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
219 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
220 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
221 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
222 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
223 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
224 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
225 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
226 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
227 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
228 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
229 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
230 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
231 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
232 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
233 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
235 #else /* This is the "abnormal" case, for EBCDIC systems */
236 static const unsigned char digitab[] =
238 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */
239 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
240 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */
241 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
242 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */
243 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
244 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */
245 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
246 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */
247 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */
248 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */
249 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- ¬ */
250 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */
251 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */
252 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
253 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
254 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 */
255 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
256 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */
257 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
258 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */
259 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
260 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */
261 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
262 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 */
263 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
264 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */
265 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
266 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */
267 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
268 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */
269 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
271 static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
272 0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */
273 0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */
274 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */
275 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
276 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 32- 39 */
277 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
278 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 */
279 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
280 0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 */
281 0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */
282 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */
283 0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- ¬ */
284 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */
285 0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */
286 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
287 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
288 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g */
289 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
290 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p */
291 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
292 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x */
293 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
294 0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 */
295 0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
296 0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* { - G */
297 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
298 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* } - P */
299 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
300 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* \ - X */
301 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
302 0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */
303 0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
307 /* Definition to allow mutual recursion */
310 compile_regex(int, int, int *, uschar **, const uschar **, const char **,
311 BOOL, int, int *, int *, branch_chain *, compile_data *);
313 /* Structure for building a chain of data that actually lives on the
314 stack, for holding the values of the subject pointer at the start of each
315 subpattern, so as to detect when an empty string has been matched by a
316 subpattern - to break infinite loops. When NO_RECURSE is set, these blocks
317 are on the heap, not on the stack. */
319 typedef struct eptrblock {
320 struct eptrblock *epb_prev;
321 const uschar *epb_saved_eptr;
324 /* Flag bits for the match() function */
326 #define match_condassert 0x01 /* Called to check a condition assertion */
327 #define match_isgroup 0x02 /* Set if start of bracketed group */
329 /* Non-error returns from the match() function. Error returns are externally
330 defined PCRE_ERROR_xxx codes, which are all negative. */
332 #define MATCH_MATCH 1
333 #define MATCH_NOMATCH 0
337 /*************************************************
339 *************************************************/
341 /* PCRE is thread-clean and doesn't use any global variables in the normal
342 sense. However, it calls memory allocation and free functions via the four
343 indirections below, and it can optionally do callouts. These values can be
344 changed by the caller, but are shared between all threads. However, when
345 compiling for Virtual Pascal, things are done differently (see pcre.in). */
349 extern "C" void *(*pcre_malloc)(size_t) = malloc;
350 extern "C" void (*pcre_free)(void *) = free;
351 extern "C" void *(*pcre_stack_malloc)(size_t) = malloc;
352 extern "C" void (*pcre_stack_free)(void *) = free;
353 extern "C" int (*pcre_callout)(pcre_callout_block *) = NULL;
355 void *(*pcre_malloc)(size_t) = malloc;
356 void (*pcre_free)(void *) = free;
357 void *(*pcre_stack_malloc)(size_t) = malloc;
358 void (*pcre_stack_free)(void *) = free;
359 int (*pcre_callout)(pcre_callout_block *) = NULL;
364 /*************************************************
365 * Macros and tables for character handling *
366 *************************************************/
368 /* When UTF-8 encoding is being used, a character is no longer just a single
369 byte. The macros for character handling generate simple sequences when used in
370 byte-mode, and more complicated ones for UTF-8 characters. */
373 #define GETCHAR(c, eptr) c = *eptr;
374 #define GETCHARINC(c, eptr) c = *eptr++;
375 #define GETCHARINCTEST(c, eptr) c = *eptr++;
376 #define GETCHARLEN(c, eptr, len) c = *eptr;
377 #define BACKCHAR(eptr)
379 #else /* SUPPORT_UTF8 */
381 /* Get the next UTF-8 character, not advancing the pointer. This is called when
382 we know we are in UTF-8 mode. */
384 #define GETCHAR(c, eptr) \
386 if ((c & 0xc0) == 0xc0) \
389 int gcaa = utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
391 c = (c & utf8_table3[gcaa]) << gcss; \
392 for (gcii = 1; gcii <= gcaa; gcii++) \
395 c |= (eptr[gcii] & 0x3f) << gcss; \
399 /* Get the next UTF-8 character, advancing the pointer. This is called when we
400 know we are in UTF-8 mode. */
402 #define GETCHARINC(c, eptr) \
404 if ((c & 0xc0) == 0xc0) \
406 int gcaa = utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
408 c = (c & utf8_table3[gcaa]) << gcss; \
412 c |= (*eptr++ & 0x3f) << gcss; \
416 /* Get the next character, testing for UTF-8 mode, and advancing the pointer */
418 #define GETCHARINCTEST(c, eptr) \
420 if (md->utf8 && (c & 0xc0) == 0xc0) \
422 int gcaa = utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
424 c = (c & utf8_table3[gcaa]) << gcss; \
428 c |= (*eptr++ & 0x3f) << gcss; \
432 /* Get the next UTF-8 character, not advancing the pointer, incrementing length
433 if there are extra bytes. This is called when we know we are in UTF-8 mode. */
435 #define GETCHARLEN(c, eptr, len) \
437 if ((c & 0xc0) == 0xc0) \
440 int gcaa = utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
442 c = (c & utf8_table3[gcaa]) << gcss; \
443 for (gcii = 1; gcii <= gcaa; gcii++) \
446 c |= (eptr[gcii] & 0x3f) << gcss; \
451 /* If the pointer is not at the start of a character, move it back until
452 it is. Called only in UTF-8 mode. */
454 #define BACKCHAR(eptr) while((*eptr & 0xc0) == 0x80) eptr--;
460 /*************************************************
461 * Default character tables *
462 *************************************************/
464 /* A default set of character tables is included in the PCRE binary. Its source
465 is built by the maketables auxiliary program, which uses the default C ctypes
466 functions, and put in the file chartables.c. These tables are used by PCRE
467 whenever the caller of pcre_compile() does not provide an alternate set of tables. */
470 #include "chartables.c"
475 /*************************************************
476 * Tables for UTF-8 support *
477 *************************************************/
479 /* These are the breakpoints for different numbers of bytes in a UTF-8 character. */
482 static const int utf8_table1[] =
483 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff};
485 /* These are the indicator bits and the mask for the data bits to set in the
486 first byte of a character, indexed by the number of additional bytes. */
488 static const int utf8_table2[] = { 0, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc};
489 static const int utf8_table3[] = { 0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01};
491 /* Table of the number of extra characters, indexed by the first character
492 masked with 0x3f. The highest number for a valid UTF-8 character is in fact
495 static const uschar utf8_table4[] = {
496 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
497 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
498 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
499 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 };
502 /*************************************************
503 * Convert character value to UTF-8 *
504 *************************************************/
506 /* This function takes an integer value in the range 0 - 0x7fffffff
507 and encodes it as a UTF-8 character in 1 to 6 bytes.
510 cvalue the character value
511 buffer pointer to buffer for result - at least 6 bytes long
513 Returns: number of characters placed in the buffer
517 ord2utf8(int cvalue, uschar *buffer)
520 for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
521 if (cvalue <= utf8_table1[i]) break;
523 for (j = i; j > 0; j--)
525 *buffer-- = 0x80 | (cvalue & 0x3f);
528 *buffer = utf8_table2[i] | cvalue;
535 /*************************************************
536 * Print compiled regex *
537 *************************************************/
539 /* The code for doing this is held in a separate file that is also included in
540 pcretest.c. It defines a function called print_internals(). */
543 #include "printint.c"
548 /*************************************************
549 * Return version string *
550 *************************************************/
552 #define STRING(a) # a /* stringify the argument exactly as written, without macro-expanding it */
553 #define XSTRING(s) STRING(s) /* macro-expand s first, then stringify the expanded text */
558 return XSTRING(PCRE_MAJOR) "." XSTRING(PCRE_MINOR) " " XSTRING(PCRE_DATE);
564 /*************************************************
565 * Flip bytes in an integer *
566 *************************************************/
568 /* This function is called when the magic number in a regex doesn't match in
569 order to flip its bytes to see if we are dealing with a pattern that was
570 compiled on a host of different endianness. If so, this function is used to
571 flip other byte values.
574 value the number to flip
575 n the number of bytes to flip (assumed to be 2 or 4)
577 Returns: the flipped value
581 byteflip(long int value, int n)
583 if (n == 2) return ((value & 0x00ff) << 8) | ((value & 0xff00) >> 8);
584 return ((value & 0x000000ff) << 24) |
585 ((value & 0x0000ff00) << 8) |
586 ((value & 0x00ff0000) >> 8) |
587 ((value & 0xff000000) >> 24);
592 /*************************************************
593 * Test for a byte-flipped compiled regex *
594 *************************************************/
596 /* This function is called from pcre_exec() and also from pcre_fullinfo(). Its
597 job is to test whether the regex is byte-flipped - that is, it was compiled on
598 a system of opposite endianness. The function is called only when the native
599 MAGIC_NUMBER test fails. If the regex is indeed flipped, we flip all the
600 relevant values into a different data block, and return it.
603 re points to the regex
604 study points to study data, or NULL
605 internal_re points to a new regex block
606 internal_study points to a new study block
608 Returns: the new block if it is indeed a byte-flipped regex
613 try_flipped(const real_pcre *re, real_pcre *internal_re,
614 const pcre_study_data *study, pcre_study_data *internal_study)
616 if (byteflip(re->magic_number, sizeof(re->magic_number)) != MAGIC_NUMBER)
619 *internal_re = *re; /* To copy other fields */
620 internal_re->size = byteflip(re->size, sizeof(re->size));
621 internal_re->options = byteflip(re->options, sizeof(re->options));
622 internal_re->top_bracket = byteflip(re->top_bracket, sizeof(re->top_bracket));
623 internal_re->top_backref = byteflip(re->top_backref, sizeof(re->top_backref));
624 internal_re->first_byte = byteflip(re->first_byte, sizeof(re->first_byte));
625 internal_re->req_byte = byteflip(re->req_byte, sizeof(re->req_byte));
626 internal_re->name_table_offset = byteflip(re->name_table_offset,
627 sizeof(re->name_table_offset));
628 internal_re->name_entry_size = byteflip(re->name_entry_size,
629 sizeof(re->name_entry_size));
630 internal_re->name_count = byteflip(re->name_count, sizeof(re->name_count));
634 *internal_study = *study; /* To copy other fields */
635 internal_study->size = byteflip(study->size, sizeof(study->size));
636 internal_study->options = byteflip(study->options, sizeof(study->options));
644 /*************************************************
645 * (Obsolete) Return info about compiled pattern *
646 *************************************************/
648 /* This is the original "info" function. It picks potentially useful data out
649 of the private structure, but its interface was too rigid. It remains for
650 backwards compatibility. The public options are passed back in an int - though
651 the re->options field has been expanded to a long int, all the public options
652 are at the low end of it, and so even on 16-bit systems this will still be OK.
653 Therefore, I haven't changed the API for pcre_info().
656 argument_re points to compiled code
657 optptr where to pass back the options
658 first_byte where to pass back the first character,
659 or -1 if multiline and all branches start ^,
662 Returns: number of capturing subpatterns
663 or negative values on error
667 pcre_info(const pcre *argument_re, int *optptr, int *first_byte)
669 real_pcre internal_re;
670 const real_pcre *re = (const real_pcre *)argument_re;
671 if (re == NULL) return PCRE_ERROR_NULL;
672 if (re->magic_number != MAGIC_NUMBER)
674 re = try_flipped(re, &internal_re, NULL, NULL);
675 if (re == NULL) return PCRE_ERROR_BADMAGIC;
677 if (optptr != NULL) *optptr = (int)(re->options & PUBLIC_OPTIONS);
678 if (first_byte != NULL)
679 *first_byte = ((re->options & PCRE_FIRSTSET) != 0)? re->first_byte :
680 ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
681 return re->top_bracket;
686 /*************************************************
687 * Return info about compiled pattern *
688 *************************************************/
690 /* This is a newer "info" function which has an extensible interface so
691 that additional items can be added compatibly.
694 argument_re points to compiled code
695 extra_data points to extra data, or NULL
696 what what information is required
697 where where to put the information
699 Returns: 0 if data returned, negative on error
703 pcre_fullinfo(const pcre *argument_re, const pcre_extra *extra_data, int what,
706 real_pcre internal_re;
707 pcre_study_data internal_study;
708 const real_pcre *re = (const real_pcre *)argument_re;
709 const pcre_study_data *study = NULL;
711 if (re == NULL || where == NULL) return PCRE_ERROR_NULL;
713 if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_STUDY_DATA) != 0)
714 study = (const pcre_study_data *)extra_data->study_data;
716 if (re->magic_number != MAGIC_NUMBER)
718 re = try_flipped(re, &internal_re, study, &internal_study);
719 if (re == NULL) return PCRE_ERROR_BADMAGIC;
720 if (study != NULL) study = &internal_study;
725 case PCRE_INFO_OPTIONS:
726 *((unsigned long int *)where) = re->options & PUBLIC_OPTIONS;
730 *((size_t *)where) = re->size;
733 case PCRE_INFO_STUDYSIZE:
734 *((size_t *)where) = (study == NULL)? 0 : study->size;
737 case PCRE_INFO_CAPTURECOUNT:
738 *((int *)where) = re->top_bracket;
741 case PCRE_INFO_BACKREFMAX:
742 *((int *)where) = re->top_backref;
745 case PCRE_INFO_FIRSTBYTE:
747 ((re->options & PCRE_FIRSTSET) != 0)? re->first_byte :
748 ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
751 /* Make sure we pass back the pointer to the bit vector in the external
752 block, not the internal copy (with flipped integer fields). */
754 case PCRE_INFO_FIRSTTABLE:
755 *((const uschar **)where) =
756 (study != NULL && (study->options & PCRE_STUDY_MAPPED) != 0)?
757 ((const pcre_study_data *)extra_data->study_data)->start_bits : NULL;
760 case PCRE_INFO_LASTLITERAL:
762 ((re->options & PCRE_REQCHSET) != 0)? re->req_byte : -1;
765 case PCRE_INFO_NAMEENTRYSIZE:
766 *((int *)where) = re->name_entry_size;
769 case PCRE_INFO_NAMECOUNT:
770 *((int *)where) = re->name_count;
773 case PCRE_INFO_NAMETABLE:
774 *((const uschar **)where) = (const uschar *)re + re->name_table_offset;
777 case PCRE_INFO_DEFAULT_TABLES:
778 *((const uschar **)where) = (const uschar *)pcre_default_tables;
781 default: return PCRE_ERROR_BADOPTION;
789 /*************************************************
790 * Return info about what features are configured *
791 *************************************************/
793 /* This is a function which has an extensible interface so that additional items
794 can be added compatibly.
797 what what information is required
798 where where to put the information
800 Returns: 0 if data returned, negative on error
804 pcre_config(int what, void *where)
808 case PCRE_CONFIG_UTF8:
816 case PCRE_CONFIG_UNICODE_PROPERTIES:
824 case PCRE_CONFIG_NEWLINE:
825 *((int *)where) = NEWLINE;
828 case PCRE_CONFIG_LINK_SIZE:
829 *((int *)where) = LINK_SIZE;
832 case PCRE_CONFIG_POSIX_MALLOC_THRESHOLD:
833 *((int *)where) = POSIX_MALLOC_THRESHOLD;
836 case PCRE_CONFIG_MATCH_LIMIT:
837 *((unsigned int *)where) = MATCH_LIMIT;
840 case PCRE_CONFIG_STACKRECURSE:
848 default: return PCRE_ERROR_BADOPTION;
857 /*************************************************
858 * Debugging function to print chars *
859 *************************************************/
861 /* Print a sequence of chars in printable format, stopping at the end of the
862 subject if requested.
865 p points to characters
866 length number to print
867 is_subject TRUE if printing from within md->start_subject
868 md pointer to matching data block, if is_subject is TRUE
874 pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
877 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
879 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
886 /*************************************************
888 *************************************************/
890 /* This function is called when a \ has been encountered. It either returns a
891 positive value for a simple escape such as \n, or a negative value which
892 encodes one of the more complicated things such as \d. When UTF-8 is enabled,
893 a positive value greater than 255 may be returned. On entry, ptr is pointing at
894 the \. On exit, it is on the final character of the escape sequence.
897 ptrptr points to the pattern position pointer
898 errorptr points to the pointer to the error message
899 bracount number of previous extracting brackets
900 options the options bits
901 isclass TRUE if inside a character class
903 Returns: zero or positive => a data character
904 negative => a special escape sequence
905 on error, errorptr is set
909 check_escape(const uschar **ptrptr, const char **errorptr, int bracount,
910 int options, BOOL isclass)
912 const uschar *ptr = *ptrptr;
915 /* If backslash is at the end of the pattern, it's an error. */
918 if (c == 0) *errorptr = ERR1;
920 /* Non-alphamerics are literals. For digits or letters, do an initial lookup in
921 a table. A non-zero result is something that can be returned immediately.
922 Otherwise further processing may be required. */
924 #if !EBCDIC /* ASCII coding */
925 else if (c < '0' || c > 'z') {} /* Not alphameric */
926 else if ((i = escapes[c - '0']) != 0) c = i;
928 #else /* EBCDIC coding */
929 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphameric */
930 else if ((i = escapes[c - 0x48]) != 0) c = i;
933 /* Escapes that need further processing, or are illegal. */
937 const uschar *oldptr;
940 /* A number of Perl escapes are not handled by PCRE. We give an explicit
951 /* The handling of escape sequences consisting of a string of digits
952 starting with one that is not zero is not straightforward. By experiment,
953 the way Perl works seems to be as follows:
955 Outside a character class, the digits are read as a decimal number. If the
956 number is less than 10, or if there are that many previous extracting
957 left brackets, then it is a back reference. Otherwise, up to three octal
958 digits are read to form an escaped byte. Thus \123 is likely to be octal
959 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
960 value is greater than 377, the least significant 8 bits are taken. Inside a
961 character class, \ followed by a digit is always an octal number. */
963 case '1': case '2': case '3': case '4': case '5':
964 case '6': case '7': case '8': case '9':
970 while ((digitab[ptr[1]] & ctype_digit) != 0)
971 c = c * 10 + *(++ptr) - '0';
972 if (c < 10 || c <= bracount)
977 ptr = oldptr; /* Put the pointer back and fall through */
980 /* Handle an octal number following \. If the first digit is 8 or 9, Perl
981 generates a binary zero byte and treats the digit as a following literal.
982 Thus we have to pull back the pointer by one. */
984 if ((c = *ptr) >= '8')
991 /* \0 always starts an octal number, but we may drop through to here with a
992 larger first octal digit. */
996 while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
997 c = c * 8 + *(++ptr) - '0';
998 c &= 255; /* Take least significant 8 bits */
1001 /* \x is complicated when UTF-8 is enabled. \x{ddd} is a character number
1002 which can be greater than 0xff, but only if the ddd are hex digits. */
1006 if (ptr[1] == '{' && (options & PCRE_UTF8) != 0)
1008 const uschar *pt = ptr + 2;
1009 register int count = 0;
1011 while ((digitab[*pt] & ctype_xdigit) != 0)
1015 #if !EBCDIC /* ASCII coding */
1016 if (cc >= 'a') cc -= 32; /* Convert to upper case */
1017 c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
1018 #else /* EBCDIC coding */
1019 if (cc >= 'a' && cc <= 'z') cc += 64; /* Convert to upper case */
1020 c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
1025 if (c < 0 || count > 8) *errorptr = ERR34;
1029 /* If the sequence of hex digits does not end with '}', then we don't
1030 recognize this construct; fall through to the normal \x handling. */
1034 /* Read a single-byte character given as up to two hex digits */
1037 while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
1039 int cc; /* Some compilers don't like ++ */
1040 cc = *(++ptr); /* in initializers */
1041 #if !EBCDIC /* ASCII coding */
1042 if (cc >= 'a') cc -= 32; /* Convert to upper case */
1043 c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
1044 #else /* EBCDIC coding */
1045 if (cc <= 'z') cc += 64; /* Convert to upper case */
1046 c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
1051 /* Other special escapes not starting with a digit are straightforward */
1061 /* A letter is upper-cased; then the 0x40 bit is flipped. This coding
1062 is ASCII-specific, but then the whole concept of \cx is ASCII-specific.
1063 (However, an EBCDIC equivalent has now been added.) */
1065 #if !EBCDIC /* ASCII coding */
1066 if (c >= 'a' && c <= 'z') c -= 32;
1068 #else /* EBCDIC coding */
1069 if (c >= 'a' && c <= 'z') c += 64;
1074 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
1075 other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
1076 for Perl compatibility, it is a literal. This code looks a bit odd, but
1077 there used to be some cases other than the default, and there may be again
1078 in future, so I haven't "optimized" it. */
1081 if ((options & PCRE_EXTRA) != 0) switch(c)
1098 /*************************************************
1099 * Handle \P and \p *
1100 *************************************************/
1102 /* This function is called after \P or \p has been encountered, provided that
1103 PCRE is compiled with support for Unicode properties. On entry, ptrptr is
1104 pointing at the P or p. On exit, it is pointing at the final character of the
1108 ptrptr points to the pattern position pointer
1109 negptr points to a boolean that is set TRUE for negation else FALSE
1110 errorptr points to the pointer to the error message
1112 Returns: value from ucp_type_table, or -1 for an invalid type
1116 get_ucp(const uschar **ptrptr, BOOL *negptr, const char **errorptr)
1119 const uschar *ptr = *ptrptr;
1123 if (c == 0) goto ERROR_RETURN;
1127 /* \P or \p can be followed by a one- or two-character name in {}, optionally
1128 preceded by ^ for negation. */
1137 for (i = 0; i <= 2; i++)
1140 if (c == 0) goto ERROR_RETURN;
1141 if (c == '}') break;
1144 if (c !='}') /* Try to distinguish error cases */
1146 while (*(++ptr) != 0 && *ptr != '}');
1147 if (*ptr == '}') goto UNKNOWN_RETURN; else goto ERROR_RETURN;
1152 /* Otherwise there is just one following character */
1162 /* Search for a recognized property name using binary chop */
1165 top = sizeof(utt)/sizeof(ucp_type_table);
1170 c = strcmp(name, utt[i].name);
1171 if (c == 0) return utt[i].value;
1172 if (c > 0) bot = i + 1; else top = i;
1190 /*************************************************
1191 * Check for counted repeat *
1192 *************************************************/
1194 /* This function is called when a '{' is encountered in a place where it might
1195 start a quantifier. It looks ahead to see if it really is a quantifier or not.
1196 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
1197 where the ddds are digits.
1200 p pointer to the first char after '{'
1202 Returns: TRUE or FALSE
1206 is_counted_repeat(const uschar *p)
1208 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
1209 while ((digitab[*p] & ctype_digit) != 0) p++;
1210 if (*p == '}') return TRUE;
1212 if (*p++ != ',') return FALSE;
1213 if (*p == '}') return TRUE;
1215 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
1216 while ((digitab[*p] & ctype_digit) != 0) p++;
1223 /*************************************************
1224 * Read repeat counts *
1225 *************************************************/
1227 /* Read an item of the form {n,m} and return the values. This is called only
1228 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
1229 so the syntax is guaranteed to be correct, but we need to check the values.
1232 p pointer to first char after '{'
1233 minp pointer to int for min
1234 maxp pointer to int for max
1235 returned as -1 if no max
1236 errorptr points to pointer to error message
1238 Returns: pointer to '}' on success;
1239 current ptr on error, with errorptr set
1242 static const uschar *
1243 read_repeat_counts(const uschar *p, int *minp, int *maxp, const char **errorptr)
1248 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
1250 if (*p == '}') max = min; else
1255 while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
1264 /* Do paranoid checks, then fill in the required variables, and pass back the
1265 pointer to the terminating '}'. */
1267 if (min > 65535 || max > 65535)
1279 /*************************************************
1280 * Find first significant op code *
1281 *************************************************/
1283 /* This is called by several functions that scan a compiled expression looking
1284 for a fixed first character, or an anchoring op code etc. It skips over things
1285 that do not influence this. For some calls, a change of option is important.
1286 For some calls, it makes sense to skip negative forward and all backward
1287 assertions, and also the \b assertion; for others it does not.
1290 code pointer to the start of the group
1291 options pointer to external options
1292 optbit the option bit whose changing is significant, or
1294 skipassert TRUE if certain assertions are to be skipped
1296 Returns: pointer to the first significant opcode
1299 static const uschar*
1300 first_significant_code(const uschar *code, int *options, int optbit,
1308 if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
1309 *options = (int)code[1];
1315 case OP_ASSERTBACK_NOT:
1316 if (!skipassert) return code;
1317 do code += GET(code, 1); while (*code == OP_ALT);
1318 code += OP_lengths[*code];
1321 case OP_WORD_BOUNDARY:
1322 case OP_NOT_WORD_BOUNDARY:
1323 if (!skipassert) return code;
1329 code += OP_lengths[*code];
1336 /* Control never reaches here */
1342 /*************************************************
1343 * Find the fixed length of a pattern *
1344 *************************************************/
1346 /* Scan a pattern and compute the fixed length of subject that will match it,
1347 if the length is fixed. This is needed for dealing with backward assertions.
1348 In UTF8 mode, the result is in characters rather than bytes.
1351 code points to the start of the pattern (the bracket)
1352 options the compiling options
1354 Returns: the fixed length, or -1 if there is no fixed length,
1355 or -2 if \C was encountered
1359 find_fixedlength(uschar *code, int options)
1363 register int branchlength = 0;
1364 register uschar *cc = code + 1 + LINK_SIZE;
1366 /* Scan along the opcodes for this branch. If we get to the end of the
1367 branch, check the length against that of the other branches. */
1372 register int op = *cc;
1373 if (op >= OP_BRA) op = OP_BRA;
1380 d = find_fixedlength(cc, options);
1381 if (d < 0) return d;
1383 do cc += GET(cc, 1); while (*cc == OP_ALT);
1384 cc += 1 + LINK_SIZE;
1387 /* Reached end of a branch; if it's a ket it is the end of a nested
1388 call. If it's ALT it is an alternation in a nested call. If it is
1389 END it's the end of the outer call. All can be handled by the same code. */
1396 if (length < 0) length = branchlength;
1397 else if (length != branchlength) return -1;
1398 if (*cc != OP_ALT) return length;
1399 cc += 1 + LINK_SIZE;
1403 /* Skip over assertive subpatterns */
1408 case OP_ASSERTBACK_NOT:
1409 do cc += GET(cc, 1); while (*cc == OP_ALT);
1412 /* Skip over things that don't match chars */
1425 case OP_NOT_WORD_BOUNDARY:
1426 case OP_WORD_BOUNDARY:
1427 cc += OP_lengths[*cc];
1430 /* Handle literal characters */
1437 if ((options & PCRE_UTF8) != 0)
1439 while ((*cc & 0xc0) == 0x80) cc++;
1444 /* Handle exact repetitions. The count is already in characters, but we
1445 need to skip over a multibyte character in UTF8 mode. */
1448 branchlength += GET2(cc,1);
1451 if ((options & PCRE_UTF8) != 0)
1453 while((*cc & 0x80) == 0x80) cc++;
1459 branchlength += GET2(cc,1);
1463 /* Handle single-char matchers */
1472 case OP_NOT_WHITESPACE:
1474 case OP_NOT_WORDCHAR:
1481 /* The single-byte matcher isn't allowed */
1486 /* Check a class for variable quantification */
1490 cc += GET(cc, 1) - 33;
1508 if (GET2(cc,1) != GET2(cc,3)) return -1;
1509 branchlength += GET2(cc,1);
1518 /* Anything else is variable length */
1524 /* Control never gets here */
1530 /*************************************************
1531 * Scan compiled regex for numbered bracket *
1532 *************************************************/
1534 /* This little function scans through a compiled pattern until it finds a
1535 capturing bracket with the given number.
1538 code points to start of expression
1539 utf8 TRUE in UTF-8 mode
1540 number the required bracket number
1542 Returns: pointer to the opcode for the bracket, or NULL if not found
1545 static const uschar *
1546 find_bracket(const uschar *code, BOOL utf8, int number)
1548 #ifndef SUPPORT_UTF8
1549 utf8 = utf8; /* Stop pedantic compilers complaining */
1554 register int c = *code;
1555 if (c == OP_END) return NULL;
1556 else if (c > OP_BRA)
1559 if (n > EXTRACT_BASIC_MAX) n = GET2(code, 2+LINK_SIZE);
1560 if (n == number) return (uschar *)code;
1561 code += OP_lengths[OP_BRA];
1565 code += OP_lengths[c];
1569 /* In UTF-8 mode, opcodes that are followed by a character may be followed
1570 by a multi-byte character. The length in the table is a minimum, so we have
1571 to scan along to skip the extra bytes. All opcodes are less than 128, so we
1572 can use relatively efficient code. */
1587 while ((*code & 0xc0) == 0x80) code++;
1590 /* XCLASS is used for classes that cannot be represented just by a bit
1591 map. This includes negated single high-valued characters. The length in
1592 the table is zero; the actual length is stored in the compiled code. */
1595 code += GET(code, 1) + 1;
1605 /*************************************************
1606 * Scan compiled regex for recursion reference *
1607 *************************************************/
1609 /* This little function scans through a compiled pattern until it finds an
1610 instance of OP_RECURSE.
1613 code points to start of expression
1614 utf8 TRUE in UTF-8 mode
1616 Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1619 static const uschar *
1620 find_recurse(const uschar *code, BOOL utf8)
1622 #ifndef SUPPORT_UTF8
1623 utf8 = utf8; /* Stop pedantic compilers complaining */
1628 register int c = *code;
1629 if (c == OP_END) return NULL;
1630 else if (c == OP_RECURSE) return code;
1631 else if (c > OP_BRA)
1633 code += OP_lengths[OP_BRA];
1637 code += OP_lengths[c];
1641 /* In UTF-8 mode, opcodes that are followed by a character may be followed
1642 by a multi-byte character. The length in the table is a minimum, so we have
1643 to scan along to skip the extra bytes. All opcodes are less than 128, so we
1644 can use relatively efficient code. */
1659 while ((*code & 0xc0) == 0x80) code++;
1662 /* XCLASS is used for classes that cannot be represented just by a bit
1663 map. This includes negated single high-valued characters. The length in
1664 the table is zero; the actual length is stored in the compiled code. */
1667 code += GET(code, 1) + 1;
1677 /*************************************************
1678 * Scan compiled branch for non-emptiness *
1679 *************************************************/
1681 /* This function scans through a branch of a compiled pattern to see whether it
1682 can match the empty string or not. It is called only from could_be_empty()
1683 below. Note that first_significant_code() skips over assertions. If we hit an
1684 unclosed bracket, we return "empty" - this means we've struck an inner bracket
1685 whose current branch will already have been scanned.
1688 code points to start of search
1689 endcode points to where to stop
1690 utf8 TRUE if in UTF8 mode
1692 Returns: TRUE if what is matched could be empty
1696 could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1699 for (code = first_significant_code(code + 1 + LINK_SIZE, NULL, 0, TRUE);
1701 code = first_significant_code(code + OP_lengths[c], NULL, 0, TRUE))
1703 const uschar *ccode;
1710 if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1712 /* Scan a closed bracket */
1714 empty_branch = FALSE;
1717 if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1718 empty_branch = TRUE;
1719 code += GET(code, 1);
1721 while (*code == OP_ALT);
1722 if (!empty_branch) return FALSE; /* All branches are non-empty */
1723 code += 1 + LINK_SIZE;
1729 /* Check for quantifiers after a class */
1733 ccode = code + GET(code, 1);
1734 goto CHECK_CLASS_REPEAT;
1747 case OP_CRSTAR: /* These could be empty; continue */
1753 default: /* Non-repeat => class must match */
1754 case OP_CRPLUS: /* These repeats aren't empty */
1760 if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1765 /* Opcodes that must match a character */
1772 case OP_NOT_WHITESPACE:
1774 case OP_NOT_WORDCHAR:
1788 case OP_TYPEMINPLUS:
1800 /* In UTF-8 mode, STAR, MINSTAR, QUERY, MINQUERY, UPTO, and MINUPTO may be
1801 followed by a multibyte character */
1810 if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1821 /*************************************************
1822 * Scan compiled regex for non-emptiness *
1823 *************************************************/
1825 /* This function is called to check for left recursive calls. We want to check
1826 the current branch of the current pattern to see if it could match the empty
1827 string. If it could, we must look outwards for branches at other levels,
1828 stopping when we pass beyond the bracket which is the subject of the recursion.
1831 code points to start of the recursion
1832 endcode points to where to stop (current RECURSE item)
1833 bcptr points to the chain of current (unclosed) branch starts
1834 utf8 TRUE if in UTF-8 mode
1836 Returns: TRUE if what is matched could be empty
1840 could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1843 while (bcptr != NULL && bcptr->current >= code)
1845 if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1846 bcptr = bcptr->outer;
1853 /*************************************************
1854 * Check for POSIX class syntax *
1855 *************************************************/
1857 /* This function is called when the sequence "[:" or "[." or "[=" is
1858 encountered in a character class. It checks whether this is followed by an
1859 optional ^ and then a sequence of letters, terminated by a matching ":]" or
1863 ptr pointer to the initial [
1864 endptr where to return the end pointer
1865 cd pointer to compile data
1867 Returns: TRUE or FALSE
1871 check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
1873 int terminator; /* Don't combine these lines; the Solaris cc */
1874 terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
1875 if (*(++ptr) == '^') ptr++;
1876 while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
1877 if (*ptr == terminator && ptr[1] == ']')
1888 /*************************************************
1889 * Check POSIX class name *
1890 *************************************************/
1892 /* This function is called to check the name given in a POSIX-style class entry
1896 ptr points to the first letter
1897 len the length of the name
1899 Returns: a value representing the name, or -1 if unknown
1903 check_posix_name(const uschar *ptr, int len)
1905 register int yield = 0;
1906 while (posix_name_lengths[yield] != 0)
1908 if (len == posix_name_lengths[yield] &&
1909 strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
1916 /*************************************************
1917 * Adjust OP_RECURSE items in repeated group *
1918 *************************************************/
1920 /* OP_RECURSE items contain an offset from the start of the regex to the group
1921 that is referenced. This means that groups can be replicated for fixed
1922 repetition simply by copying (because the recursion is allowed to refer to
1923 earlier groups that are outside the current group). However, when a group is
1924 optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
1925 it, after it has been compiled. This means that any OP_RECURSE items within it
1926 that refer to the group itself or any contained groups have to have their
1927 offsets adjusted. That is the job of this function. Before it is called, the
1928 partially compiled regex must be temporarily terminated with OP_END.
1931 group points to the start of the group
1932 adjust the amount by which the group is to be moved
1933 utf8 TRUE in UTF-8 mode
1934 cd contains pointers to tables etc.
1940 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd)
1942 uschar *ptr = group;
1943 while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1945 int offset = GET(ptr, 1);
1946 if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1947 ptr += 1 + LINK_SIZE;
1953 /*************************************************
1954 * Insert an automatic callout point *
1955 *************************************************/
1957 /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
1958 callout points before each pattern item.
1961 code current code pointer
1962 ptr current pattern pointer
1963 cd pointers to tables etc
1965 Returns: new code pointer
1969 auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
1971 *code++ = OP_CALLOUT;
1973 PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
1974 PUT(code, LINK_SIZE, 0); /* Default length */
1975 return code + 2*LINK_SIZE;
1980 /*************************************************
1981 * Complete a callout item *
1982 *************************************************/
1984 /* A callout item contains the length of the next item in the pattern, which
1985 we can't fill in till after we have reached the relevant point. This is used
1986 for both automatic and manual callouts.
1989 previous_callout points to previous callout item
1990 ptr current pattern pointer
1991 cd pointers to tables etc
1997 complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
1999 int length = ptr - cd->start_pattern - GET(previous_callout, 2);
2000 PUT(previous_callout, 2 + LINK_SIZE, length);
2006 /*************************************************
2007 * Get othercase range *
2008 *************************************************/
2010 /* This function is passed the start and end of a class range, in UTF-8 mode
2011 with UCP support. It searches up the characters, looking for internal ranges of
2012 characters in the "other" case. Each call returns the next one, updating the
2016 cptr points to starting character value; updated
2018 ocptr where to put start of othercase range
2019 odptr where to put end of othercase range
2021 Yield: TRUE when range returned; FALSE when no more
2025 get_othercase_range(int *cptr, int d, int *ocptr, int *odptr)
2027 int c, chartype, othercase, next;
2029 for (c = *cptr; c <= d; c++)
2031 if (ucp_findchar(c, &chartype, &othercase) == ucp_L && othercase != 0) break;
2034 if (c > d) return FALSE;
2037 next = othercase + 1;
2039 for (++c; c <= d; c++)
2041 if (ucp_findchar(c, &chartype, &othercase) != ucp_L || othercase != next)
2051 #endif /* SUPPORT_UCP */
2054 /*************************************************
2055 * Compile one branch *
2056 *************************************************/
2058 /* Scan the pattern, compiling it into the code vector. If the options are
2059 changed during the branch, the pointer is used to change the external options
2063 optionsptr pointer to the option bits
2064 brackets points to number of extracting brackets used
2065 codeptr points to the pointer to the current code point
2066 ptrptr points to the current pattern pointer
2067 errorptr points to pointer to error message
2068 firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2069 reqbyteptr set to the last literal character required, else < 0
2070 bcptr points to current branch chain
2071 cd contains pointers to tables etc.
2073 Returns: TRUE on success
2074 FALSE, with *errorptr set on error
2078 compile_branch(int *optionsptr, int *brackets, uschar **codeptr,
2079 const uschar **ptrptr, const char **errorptr, int *firstbyteptr,
2080 int *reqbyteptr, branch_chain *bcptr, compile_data *cd)
2082 int repeat_type, op_type;
2083 int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
2085 int greedy_default, greedy_non_default;
2086 int firstbyte, reqbyte;
2087 int zeroreqbyte, zerofirstbyte;
2088 int req_caseopt, reqvary, tempreqvary;
2090 int options = *optionsptr;
2091 int after_manual_callout = 0;
2093 register uschar *code = *codeptr;
2095 BOOL inescq = FALSE;
2096 BOOL groupsetfirstbyte = FALSE;
2097 const uschar *ptr = *ptrptr;
2098 const uschar *tempptr;
2099 uschar *previous = NULL;
2100 uschar *previous_callout = NULL;
2101 uschar classbits[32];
2105 BOOL utf8 = (options & PCRE_UTF8) != 0;
2106 uschar *class_utf8data;
2107 uschar utf8_char[6];
2112 /* Set up the default and non-default settings for greediness */
2114 greedy_default = ((options & PCRE_UNGREEDY) != 0);
2115 greedy_non_default = greedy_default ^ 1;
2117 /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2118 matching encountered yet". It gets changed to REQ_NONE if we hit something that
2119 matches a non-fixed first char; reqbyte just remains unset if we never
2122 When we hit a repeat whose minimum is zero, we may have to adjust these values
2123 to take the zero repeat into account. This is implemented by setting them to
2124 zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2125 item types that can be repeated set these backoff variables appropriately. */
2127 firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2129 /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2130 according to the current setting of the caseless flag. REQ_CASELESS is a bit
2131 value > 255. It is added into the firstbyte or reqbyte variables to record the
2132 case status of the value. This is used only for ASCII characters. */
2134 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2136 /* Switch on next character until the end of the branch */
2141 BOOL possessive_quantifier;
2143 int class_charcount;
2153 /* Next byte in the pattern */
2157 /* If in \Q...\E, check for the end; if not, we have a literal */
2159 if (inescq && c != 0)
2161 if (c == '\\' && ptr[1] == 'E')
2169 if (previous_callout != NULL)
2171 complete_callout(previous_callout, ptr, cd);
2172 previous_callout = NULL;
2174 if ((options & PCRE_AUTO_CALLOUT) != 0)
2176 previous_callout = code;
2177 code = auto_callout(code, ptr, cd);
2183 /* Fill in length of a previous callout, except when the next thing is
2186 is_quantifier = c == '*' || c == '+' || c == '?' ||
2187 (c == '{' && is_counted_repeat(ptr+1));
2189 if (!is_quantifier && previous_callout != NULL &&
2190 after_manual_callout-- <= 0)
2192 complete_callout(previous_callout, ptr, cd);
2193 previous_callout = NULL;
2196 /* In extended mode, skip white space and comments */
2198 if ((options & PCRE_EXTENDED) != 0)
2200 if ((cd->ctypes[c] & ctype_space) != 0) continue;
2203 /* The space before the ; is to avoid a warning on a silly compiler
2204 on the Macintosh. */
2205 while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
2206 if (c != 0) continue; /* Else fall through to handle end of string */
2210 /* No auto callout for quantifiers. */
2212 if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
2214 previous_callout = code;
2215 code = auto_callout(code, ptr, cd);
2220 /* The branch terminates at end of string, |, or ). */
2225 *firstbyteptr = firstbyte;
2226 *reqbyteptr = reqbyte;
2231 /* Handle single-character metacharacters. In multiline mode, ^ disables
2232 the setting of any following char as a first character. */
2235 if ((options & PCRE_MULTILINE) != 0)
2237 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2248 /* There can never be a first char if '.' is first, whatever happens about
2249 repeats. The value of reqbyte doesn't change either. */
2252 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2253 zerofirstbyte = firstbyte;
2254 zeroreqbyte = reqbyte;
2259 /* Character classes. If the included characters are all < 256 in value, we
2260 build a 32-byte bitmap of the permitted characters, except in the special
2261 case where there is only one such character. For negated classes, we build
2262 the map as usual, then invert it at the end. However, we use a different
2263 opcode so that data characters > 255 can be handled correctly.
2265 If the class contains characters outside the 0-255 range, a different
2266 opcode is compiled. It may optionally have a bit map for characters < 256,
2267 but those above are explicitly listed afterwards. A flag byte tells
2268 whether the bitmap is present, and whether this is a negated class or not.
2274 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
2275 they are encountered at the top level, so we'll do that too. */
2277 if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2278 check_posix_syntax(ptr, &tempptr, cd))
2280 *errorptr = (ptr[1] == ':')? ERR13 : ERR31;
2284 /* If the first character is '^', set the negation flag and skip it. */
2286 if ((c = *(++ptr)) == '^')
2288 negate_class = TRUE;
2293 negate_class = FALSE;
2296 /* Keep a count of chars with values < 256 so that we can optimize the case
2297 of just a single character (as long as it's < 256). For higher valued UTF-8
2298 characters, we don't yet do any optimization. */
2300 class_charcount = 0;
2301 class_lastchar = -1;
2304 class_utf8 = FALSE; /* No chars >= 256 */
2305 class_utf8data = code + LINK_SIZE + 34; /* For UTF-8 items */
2308 /* Initialize the 32-char bit map to all zeros. We have to build the
2309 map in a temporary bit of store, in case the class contains only 1
2310 character (< 256), because in that case the compiled code doesn't use the
2313 memset(classbits, 0, 32 * sizeof(uschar));
2315 /* Process characters until ] is reached. By writing this as a "do" it
2316 means that an initial ] is taken as a data character. The first pass
2317 through the regex checked the overall syntax, so we don't need to be very
2318 strict here. At the start of the loop, c contains the first byte of the
2324 if (utf8 && c > 127)
2325 { /* Braces are required because the */
2326 GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
2330 /* Inside \Q...\E everything is literal except \E */
2334 if (c == '\\' && ptr[1] == 'E')
2340 else goto LONE_SINGLE_CHARACTER;
2343 /* Handle POSIX class names. Perl allows a negation extension of the
2344 form [:^name:]. A square bracket that doesn't match the syntax is
2345 treated as a literal. We also recognize the POSIX constructions
2346 [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
2350 (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2351 check_posix_syntax(ptr, &tempptr, cd))
2353 BOOL local_negate = FALSE;
2355 register const uschar *cbits = cd->cbits;
2366 local_negate = TRUE;
2370 posix_class = check_posix_name(ptr, tempptr - ptr);
2371 if (posix_class < 0)
2377 /* If matching is caseless, upper and lower are converted to
2378 alpha. This relies on the fact that the class table starts with
2379 alpha, lower, upper as the first 3 entries. */
2381 if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
2384 /* Or into the map we are building up to 3 of the static class
2385 tables, or their negations. The [:blank:] class sets up the same
2386 chars as the [:space:] class (all white space). We remove the vertical
2387 white space chars afterwards. */
2390 for (i = 0; i < 3; i++)
2392 BOOL blankclass = strncmp((char *)ptr, "blank", 5) == 0;
2393 int taboffset = posix_class_maps[posix_class + i];
2394 if (taboffset < 0) break;
2398 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+taboffset];
2400 for (c = 0; c < 32; c++) classbits[c] &= ~cbits[c+taboffset];
2401 if (blankclass) classbits[1] |= 0x3c;
2405 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+taboffset];
2406 if (blankclass) classbits[1] &= ~0x3c;
2411 class_charcount = 10; /* Set > 1; assumes more than 1 per class */
2412 continue; /* End of POSIX syntax handling */
2415 /* Backslash may introduce a single character, or it may introduce one
2416 of the specials, which just set a flag. Escaped items are checked for
2417 validity in the pre-compiling pass. The sequence \b is a special case.
2418 Inside a class (and only there) it is treated as backspace. Elsewhere
2419 it marks a word boundary. Other escapes have preset maps ready to
2420 or into the one we are building. We assume they have more than one
2421 character in them, so set class_charcount bigger than one. */
2425 c = check_escape(&ptr, errorptr, *brackets, options, TRUE);
2427 if (-c == ESC_b) c = '\b'; /* \b is backspace in a class */
2428 else if (-c == ESC_X) c = 'X'; /* \X is literal X in a class */
2429 else if (-c == ESC_Q) /* Handle start of quoted string */
2431 if (ptr[1] == '\\' && ptr[2] == 'E')
2433 ptr += 2; /* avoid empty string */
2441 register const uschar *cbits = cd->cbits;
2442 class_charcount += 2; /* Greater than 1 is what matters */
2446 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
2450 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
2454 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
2458 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
2462 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
2463 classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
2467 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
2468 classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
2476 int property = get_ucp(&ptr, &negated, errorptr);
2477 if (property < 0) goto FAILED;
2479 *class_utf8data++ = ((-c == ESC_p) != negated)?
2480 XCL_PROP : XCL_NOTPROP;
2481 *class_utf8data++ = property;
2482 class_charcount -= 2; /* Not a < 256 character */
2487 /* Unrecognized escapes are faulted if PCRE is running in its
2488 strict mode. By default, for compatibility with Perl, they are
2489 treated as literals. */
2492 if ((options & PCRE_EXTRA) != 0)
2497 c = *ptr; /* The final character */
2498 class_charcount -= 2; /* Undo the default count from above */
2502 /* Fall through if we have a single character (c >= 0). This may be
2503 > 256 in UTF-8 mode. */
2505 } /* End of backslash handling */
2507 /* A single character may be followed by '-' to form a range. However,
2508 Perl does not permit ']' to be the end of the range. A '-' character
2509 here is treated as a literal. */
2511 if (ptr[1] == '-' && ptr[2] != ']')
2518 { /* Braces are required because the */
2519 GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
2523 d = *ptr; /* Not UTF-8 mode */
2525 /* The second part of a range can be a single-character escape, but
2526 not any of the other escapes. Perl 5.6 treats a hyphen as a literal
2527 in such circumstances. */
2531 const uschar *oldptr = ptr;
2532 d = check_escape(&ptr, errorptr, *brackets, options, TRUE);
2534 /* \b is backspace; \X is literal X; any other special means the '-'
2539 if (d == -ESC_b) d = '\b';
2540 else if (d == -ESC_X) d = 'X'; else
2543 goto LONE_SINGLE_CHARACTER; /* A few lines below */
2548 /* The check that the two values are in the correct order happens in
2549 the pre-pass. Optimize one-character ranges */
2551 if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
2553 /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
2554 matching, we have to use an XCLASS with extra data items. Caseless
2555 matching for characters > 127 is available only if UCP support is
2559 if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
2563 /* With UCP support, we can find the other case equivalents of
2564 the relevant characters. There may be several ranges. Optimize how
2565 they fit with the basic range. */
2568 if ((options & PCRE_CASELESS) != 0)
2573 while (get_othercase_range(&cc, origd, &occ, &ocd))
2575 if (occ >= c && ocd <= d) continue; /* Skip embedded ranges */
2577 if (occ < c && ocd >= c - 1) /* Extend the basic range */
2578 { /* if there is overlap, */
2579 c = occ; /* noting that if occ < c */
2580 continue; /* we can't have ocd > d */
2581 } /* because a subrange is */
2582 if (ocd > d && occ <= d + 1) /* always shorter than */
2583 { /* the basic range. */
2590 *class_utf8data++ = XCL_SINGLE;
2594 *class_utf8data++ = XCL_RANGE;
2595 class_utf8data += ord2utf8(occ, class_utf8data);
2597 class_utf8data += ord2utf8(ocd, class_utf8data);
2600 #endif /* SUPPORT_UCP */
2602 /* Now record the original range, possibly modified for UCP caseless
2603 overlapping ranges. */
2605 *class_utf8data++ = XCL_RANGE;
2606 class_utf8data += ord2utf8(c, class_utf8data);
2607 class_utf8data += ord2utf8(d, class_utf8data);
2609 /* With UCP support, we are done. Without UCP support, there is no
2610 caseless matching for UTF-8 characters > 127; we can use the bit map
2611 for the smaller ones. */
2614 continue; /* With next character in the class */
2616 if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
2618 /* Adjust upper limit and fall through to set up the map */
2622 #endif /* SUPPORT_UCP */
2624 #endif /* SUPPORT_UTF8 */
2626 /* We use the bit map for all cases when not in UTF-8 mode; else
2627 ranges that lie entirely within 0-127 when there is UCP support; else
2628 for partial ranges without UCP support. */
2632 classbits[c/8] |= (1 << (c&7));
2633 if ((options & PCRE_CASELESS) != 0)
2635 int uc = cd->fcc[c]; /* flip case */
2636 classbits[uc/8] |= (1 << (uc&7));
2638 class_charcount++; /* in case a one-char range */
2642 continue; /* Go get the next char in the class */
2645 /* Handle a lone single character - we can get here for a normal
2646 non-escape char, or after \ that introduces a single character or for an
2647 apparent range that isn't. */
2649 LONE_SINGLE_CHARACTER:
2651 /* Handle a character that cannot go in the bit map */
2654 if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
2657 *class_utf8data++ = XCL_SINGLE;
2658 class_utf8data += ord2utf8(c, class_utf8data);
2661 if ((options & PCRE_CASELESS) != 0)
2665 if (ucp_findchar(c, &chartype, &othercase) >= 0 && othercase > 0)
2667 *class_utf8data++ = XCL_SINGLE;
2668 class_utf8data += ord2utf8(othercase, class_utf8data);
2671 #endif /* SUPPORT_UCP */
2675 #endif /* SUPPORT_UTF8 */
2677 /* Handle a single-byte character */
2679 classbits[c/8] |= (1 << (c&7));
2680 if ((options & PCRE_CASELESS) != 0)
2682 c = cd->fcc[c]; /* flip case */
2683 classbits[c/8] |= (1 << (c&7));
2690 /* Loop until ']' reached; the check for end of string happens inside the
2691 loop. This "while" is the end of the "do" above. */
2693 while ((c = *(++ptr)) != ']' || inescq);
2695 /* If class_charcount is 1, we saw precisely one character whose value is
2696 less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we
2697 can optimize the negative case only if there were no characters >= 128
2698 because OP_NOT and the related opcodes like OP_NOTSTAR operate on
2699 single-bytes only. This is an historical hangover. Maybe one day we can
2700 tidy these opcodes to handle multi-byte characters.
2702 The optimization throws away the bit map. We turn the item into a
2703 1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
2704 that OP_NOT does not support multibyte characters. In the positive case, it
2705 can cause firstbyte to be set. Otherwise, there can be no first char if
2706 this item is first, whatever repeat count may follow. In the case of
2707 reqbyte, save the previous value for reinstating. */
2710 if (class_charcount == 1 &&
2712 (!class_utf8 && (!negate_class || class_lastchar < 128))))
2715 if (class_charcount == 1)
2718 zeroreqbyte = reqbyte;
2720 /* The OP_NOT opcode works on one-byte characters only. */
2724 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2725 zerofirstbyte = firstbyte;
2727 *code++ = class_lastchar;
2731 /* For a single, positive character, get the value into mcbuffer, and
2732 then we can handle this with the normal one-character code. */
2735 if (utf8 && class_lastchar > 127)
2736 mclength = ord2utf8(class_lastchar, mcbuffer);
2740 mcbuffer[0] = class_lastchar;
2744 } /* End of 1-char optimization */
2746 /* The general case - not the one-char optimization. If this is the first
2747 thing in the branch, there can be no first char setting, whatever the
2748 repeat count. Any reqbyte setting must remain unchanged after any kind of
2751 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2752 zerofirstbyte = firstbyte;
2753 zeroreqbyte = reqbyte;
2755 /* If there are characters with values > 255, we have to compile an
2756 extended class, with its own opcode. If there are no characters < 256,
2757 we can omit the bitmap. */
2762 *class_utf8data++ = XCL_END; /* Marks the end of extra data */
2763 *code++ = OP_XCLASS;
2765 *code = negate_class? XCL_NOT : 0;
2767 /* If the map is required, install it, and move on to the end of
2770 if (class_charcount > 0)
2773 memcpy(code, classbits, 32);
2774 code = class_utf8data;
2777 /* If the map is not required, slide down the extra data. */
2781 int len = class_utf8data - (code + 33);
2782 memmove(code + 1, code + 33, len);
2786 /* Now fill in the complete length of the item */
2788 PUT(previous, 1, code - previous);
2789 break; /* End of class handling */
2793 /* If there are no characters > 255, negate the 32-byte map if necessary,
2794 and copy it into the code vector. If this is the first thing in the branch,
2795 there can be no first char setting, whatever the repeat count. Any reqbyte
2796 setting must remain unchanged after any kind of repeat. */
2800 *code++ = OP_NCLASS;
2801 for (c = 0; c < 32; c++) code[c] = ~classbits[c];
2806 memcpy(code, classbits, 32);
2811 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
2812 has been tested above. */
2815 if (!is_quantifier) goto NORMAL_CHAR;
2816 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorptr);
2817 if (*errorptr != NULL) goto FAILED;
2835 if (previous == NULL)
2841 if (repeat_min == 0)
2843 firstbyte = zerofirstbyte; /* Adjust for zero repeat */
2844 reqbyte = zeroreqbyte; /* Ditto */
2847 /* Remember whether this is a variable length repeat */
2849 reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
2851 op_type = 0; /* Default single-char op codes */
2852 possessive_quantifier = FALSE; /* Default not possessive quantifier */
2854 /* Save start of previous item, in case we have to move it up to make space
2855 for an inserted OP_ONCE for the additional '+' extension. */
2857 tempcode = previous;
2859 /* If the next character is '+', we have a possessive quantifier. This
2860 implies greediness, whatever the setting of the PCRE_UNGREEDY option.
2861 If the next character is '?' this is a minimizing repeat, by default,
2862 but if PCRE_UNGREEDY is set, it works the other way round. We change the
2863 repeat type to the non-default. */
2867 repeat_type = 0; /* Force greedy */
2868 possessive_quantifier = TRUE;
2871 else if (ptr[1] == '?')
2873 repeat_type = greedy_non_default;
2876 else repeat_type = greedy_default;
2878 /* If previous was a recursion, we need to wrap it inside brackets so that
2879 it can be replicated if necessary. */
2881 if (*previous == OP_RECURSE)
2883 memmove(previous + 1 + LINK_SIZE, previous, 1 + LINK_SIZE);
2884 code += 1 + LINK_SIZE;
2886 PUT(previous, 1, code - previous);
2888 PUT(code, 1, code - previous);
2889 code += 1 + LINK_SIZE;
2892 /* If previous was a character match, abolish the item and generate a
2893 repeat item instead. If a char item has a minimum of more than one, ensure
2894 that it is set in reqbyte - it might not be if a sequence such as x{3} is
2895 the first thing in a branch because the x will have gone into firstbyte
2898 if (*previous == OP_CHAR || *previous == OP_CHARNC)
2900 /* Deal with UTF-8 characters that take up more than one byte. It's
2901 easier to write this out separately than try to macrify it. Use c to
2902 hold the length of the character in bytes, plus 0x80 to flag that it's a
2903 length rather than a small character. */
2906 if (utf8 && (code[-1] & 0x80) != 0)
2908 uschar *lastchar = code - 1;
2909 while((*lastchar & 0xc0) == 0x80) lastchar--;
2910 c = code - lastchar; /* Length of UTF-8 character */
2911 memcpy(utf8_char, lastchar, c); /* Save the char */
2912 c |= 0x80; /* Flag c as a length */
2917 /* Handle the case of a single byte - either with no UTF8 support, or
2918 with UTF-8 disabled, or for a UTF-8 character < 128. */
2922 if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
2925 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
2928 /* If previous was a single negated character ([^a] or similar), we use
2929 one of the special opcodes, replacing it. The code is shared with single-
2930 character repeats by setting op_type to add a suitable offset into
2931 repeat_type. OP_NOT is currently used only for single-byte chars. */
2933 else if (*previous == OP_NOT)
2935 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
2937 goto OUTPUT_SINGLE_REPEAT;
2940 /* If previous was a character type match (\d or similar), abolish it and
2941 create a suitable repeat item. The code is shared with single-character
2942 repeats by setting op_type to add a suitable offset into repeat_type. Note
2943 that the Unicode property types will be present only when SUPPORT_UCP is
2944 defined, but we don't wrap the little bits of code here because it just
2945 makes it horribly messy. */
2947 else if (*previous < OP_EODN)
2951 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
2954 OUTPUT_SINGLE_REPEAT:
2955 prop_type = (*previous == OP_PROP || *previous == OP_NOTPROP)?
2959 code = previous; /* Usually overwrite previous item */
2961 /* If the maximum is zero then the minimum must also be zero; Perl allows
2962 this case, so we do too - by simply omitting the item altogether. */
2964 if (repeat_max == 0) goto END_REPEAT;
2966 /* All real repeats make it impossible to handle partial matching (maybe
2967 one day we will be able to remove this restriction). */
2969 if (repeat_max != 1) cd->nopartial = TRUE;
2971 /* Combine the op_type with the repeat_type */
2973 repeat_type += op_type;
2975 /* A minimum of zero is handled either as the special case * or ?, or as
2976 an UPTO, with the maximum given. */
2978 if (repeat_min == 0)
2980 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
2981 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
2984 *code++ = OP_UPTO + repeat_type;
2985 PUT2INC(code, 0, repeat_max);
2989 /* A repeat minimum of 1 is optimized into some special cases. If the
2990 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
2991 left in place and, if the maximum is greater than 1, we use OP_UPTO with
2992 one less than the maximum. */
2994 else if (repeat_min == 1)
2996 if (repeat_max == -1)
2997 *code++ = OP_PLUS + repeat_type;
3000 code = oldcode; /* leave previous item in place */
3001 if (repeat_max == 1) goto END_REPEAT;
3002 *code++ = OP_UPTO + repeat_type;
3003 PUT2INC(code, 0, repeat_max - 1);
3007 /* The case {n,n} is just an EXACT, while the general case {n,m} is
3008 handled as an EXACT followed by an UPTO. */
3012 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
3013 PUT2INC(code, 0, repeat_min);
3015 /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
3016 we have to insert the character for the previous code. For a repeated
3017 Unicode property match, there is an extra byte that defines the
3018 required property. In UTF-8 mode, long characters have their length in
3019 c, with the 0x80 bit as a flag. */
3024 if (utf8 && c >= 128)
3026 memcpy(code, utf8_char, c & 7);
3033 if (prop_type >= 0) *code++ = prop_type;
3035 *code++ = OP_STAR + repeat_type;
3038 /* Else insert an UPTO if the max is greater than the min, again
3039 preceded by the character, for the previously inserted code. */
3041 else if (repeat_max != repeat_min)
3044 if (utf8 && c >= 128)
3046 memcpy(code, utf8_char, c & 7);
3052 if (prop_type >= 0) *code++ = prop_type;
3053 repeat_max -= repeat_min;
3054 *code++ = OP_UPTO + repeat_type;
3055 PUT2INC(code, 0, repeat_max);
3059 /* The character or character type itself comes last in all cases. */
3062 if (utf8 && c >= 128)
3064 memcpy(code, utf8_char, c & 7);
3071 /* For a repeated Unicode property match, there is an extra byte that
3072 defines the required property. */
3075 if (prop_type >= 0) *code++ = prop_type;
3079 /* If previous was a character class or a back reference, we put the repeat
3080 stuff after it, but just skip the item if the repeat was {0,0}. */
3082 else if (*previous == OP_CLASS ||
3083 *previous == OP_NCLASS ||
3085 *previous == OP_XCLASS ||
3087 *previous == OP_REF)
3089 if (repeat_max == 0)
3095 /* All real repeats make it impossible to handle partial matching (maybe
3096 one day we will be able to remove this restriction). */
3098 if (repeat_max != 1) cd->nopartial = TRUE;
3100 if (repeat_min == 0 && repeat_max == -1)
3101 *code++ = OP_CRSTAR + repeat_type;
3102 else if (repeat_min == 1 && repeat_max == -1)
3103 *code++ = OP_CRPLUS + repeat_type;
3104 else if (repeat_min == 0 && repeat_max == 1)
3105 *code++ = OP_CRQUERY + repeat_type;
3108 *code++ = OP_CRRANGE + repeat_type;
3109 PUT2INC(code, 0, repeat_min);
3110 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
3111 PUT2INC(code, 0, repeat_max);
3115 /* If previous was a bracket group, we may have to replicate it in certain
3118 else if (*previous >= OP_BRA || *previous == OP_ONCE ||
3119 *previous == OP_COND)
3123 int len = code - previous;
3124 uschar *bralink = NULL;
3126 /* If the maximum repeat count is unlimited, find the end of the bracket
3127 by scanning through from the start, and compute the offset back to it
3128 from the current code pointer. There may be an OP_OPT setting following
3129 the final KET, so we can't find the end just by going back from the code
3132 if (repeat_max == -1)
3134 register uschar *ket = previous;
3135 do ket += GET(ket, 1); while (*ket != OP_KET);
3136 ketoffset = code - ket;
3139 /* The case of a zero minimum is special because of the need to stick
3140 OP_BRAZERO in front of it, and because the group appears once in the
3141 data, whereas in other cases it appears the minimum number of times. For
3142 this reason, it is simplest to treat this case separately, as otherwise
3143 the code gets far too messy. There are several special subcases when the
3146 if (repeat_min == 0)
3148 /* If the maximum is also zero, we just omit the group from the output
3151 if (repeat_max == 0)
3157 /* If the maximum is 1 or unlimited, we just have to stick in the
3158 BRAZERO and do no more at this point. However, we do need to adjust
3159 any OP_RECURSE calls inside the group that refer to the group itself or
3160 any internal group, because the offset is from the start of the whole
3161 regex. Temporarily terminate the pattern while doing this. */
3163 if (repeat_max <= 1)
3166 adjust_recurse(previous, 1, utf8, cd);
3167 memmove(previous+1, previous, len);
3169 *previous++ = OP_BRAZERO + repeat_type;
3172 /* If the maximum is greater than 1 and limited, we have to replicate
3173 in a nested fashion, sticking OP_BRAZERO before each set of brackets.
3174 The first one has to be handled carefully because it's the original
3175 copy, which has to be moved up. The remainder can be handled by code
3176 that is common with the non-zero minimum case below. We have to
3177 adjust the value of repeat_max, since one less copy is required. Once
3178 again, we may have to adjust any OP_RECURSE calls inside the group. */
3184 adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd);
3185 memmove(previous + 2 + LINK_SIZE, previous, len);
3186 code += 2 + LINK_SIZE;
3187 *previous++ = OP_BRAZERO + repeat_type;
3188 *previous++ = OP_BRA;
3190 /* We chain together the bracket offset fields that have to be
3191 filled in later when the ends of the brackets are reached. */
3193 offset = (bralink == NULL)? 0 : previous - bralink;
3195 PUTINC(previous, 0, offset);
3201 /* If the minimum is greater than zero, replicate the group as many
3202 times as necessary, and adjust the maximum to the number of subsequent
3203 copies that we need. If we set a first char from the group, and didn't
3204 set a required char, copy the latter from the former. */
3210 if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
3211 for (i = 1; i < repeat_min; i++)
3213 memcpy(code, previous, len);
3217 if (repeat_max > 0) repeat_max -= repeat_min;
3220 /* This code is common to both the zero and non-zero minimum cases. If
3221 the maximum is limited, it replicates the group in a nested fashion,
3222 remembering the bracket starts on a stack. In the case of a zero minimum,
3223 the first one was set up above. In all cases the repeat_max now specifies
3224 the number of additional copies needed. */
3226 if (repeat_max >= 0)
3228 for (i = repeat_max - 1; i >= 0; i--)
3230 *code++ = OP_BRAZERO + repeat_type;
3232 /* All but the final copy start a new nesting, maintaining the
3233 chain of brackets outstanding. */
3239 offset = (bralink == NULL)? 0 : code - bralink;
3241 PUTINC(code, 0, offset);
3244 memcpy(code, previous, len);
3248 /* Now chain through the pending brackets, and fill in their length
3249 fields (which are holding the chain links pro tem). */
3251 while (bralink != NULL)
3254 int offset = code - bralink + 1;
3255 uschar *bra = code - offset;
3256 oldlinkoffset = GET(bra, 1);
3257 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
3259 PUTINC(code, 0, offset);
3260 PUT(bra, 1, offset);
3264 /* If the maximum is unlimited, set a repeater in the final copy. We
3265 can't just offset backwards from the current code point, because we
3266 don't know if there's been an options resetting after the ket. The
3267 correct offset was computed above. */
3269 else code[-ketoffset] = OP_KETRMAX + repeat_type;
3272 /* Else there's some kind of shambles */
3280 /* If the character following a repeat is '+', we wrap the entire repeated
3281 item inside OP_ONCE brackets. This is just syntactic sugar, taken from
3282 Sun's Java package. The repeated item starts at tempcode, not at previous,
3283 which might be the first part of a string whose (former) last char we
3284 repeated. However, we don't support '+' after a greediness '?'. */
3286 if (possessive_quantifier)
3288 int len = code - tempcode;
3289 memmove(tempcode + 1+LINK_SIZE, tempcode, len);
3290 code += 1 + LINK_SIZE;
3291 len += 1 + LINK_SIZE;
3292 tempcode[0] = OP_ONCE;
3294 PUTINC(code, 0, len);
3295 PUT(tempcode, 1, len);
3298 /* In all cases we no longer have a previous item. We also set the
3299 "follows varying string" flag for subsequently encountered reqbytes if
3300 it isn't already set and we have just passed a varying length item. */
3304 cd->req_varyopt |= reqvary;
3308 /* Start of nested bracket sub-expression, or comment or lookahead or
3309 lookbehind or option setting or condition. First deal with special things
3310 that can come after a bracket; all are introduced by ?, and the appearance
3311 of any of them means that this is not a referencing group. They were
3312 checked for validity in the first pass over the string, so we don't have to
3313 check for syntax errors here. */
3316 newoptions = options;
3319 if (*(++ptr) == '?')
3326 case '#': /* Comment; skip to ket */
3328 while (*ptr != ')') ptr++;
3331 case ':': /* Non-extracting bracket */
3337 bravalue = OP_COND; /* Conditional group */
3339 /* Condition to test for recursion */
3343 code[1+LINK_SIZE] = OP_CREF;
3344 PUT2(code, 2+LINK_SIZE, CREF_RECURSE);
3349 /* Condition to test for a numbered subpattern match. We know that
3350 if a digit follows ( then there will just be digits until ) because
3351 the syntax was checked in the first pass. */
3353 else if ((digitab[ptr[1]] && ctype_digit) != 0)
3355 int condref; /* Don't amalgamate; some compilers */
3356 condref = *(++ptr) - '0'; /* grumble at autoincrement in declaration */
3357 while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';
3364 code[1+LINK_SIZE] = OP_CREF;
3365 PUT2(code, 2+LINK_SIZE, condref);
3368 /* For conditions that are assertions, we just fall through, having
3369 set bravalue above. */
3372 case '=': /* Positive lookahead */
3373 bravalue = OP_ASSERT;
3377 case '!': /* Negative lookahead */
3378 bravalue = OP_ASSERT_NOT;
3382 case '<': /* Lookbehinds */
3385 case '=': /* Positive lookbehind */
3386 bravalue = OP_ASSERTBACK;
3390 case '!': /* Negative lookbehind */
3391 bravalue = OP_ASSERTBACK_NOT;
3397 case '>': /* One-time brackets */
3402 case 'C': /* Callout - may be followed by digits; */
3403 previous_callout = code; /* Save for later completion */
3404 after_manual_callout = 1; /* Skip one item before completing */
3405 *code++ = OP_CALLOUT; /* Already checked that the terminating */
3406 { /* closing parenthesis is present. */
3408 while ((digitab[*(++ptr)] & ctype_digit) != 0)
3409 n = n * 10 + *ptr - '0';
3416 PUT(code, 0, ptr - cd->start_pattern + 1); /* Pattern offset */
3417 PUT(code, LINK_SIZE, 0); /* Default length */
3418 code += 2 * LINK_SIZE;
3423 case 'P': /* Named subpattern handling */
3424 if (*(++ptr) == '<') /* Definition */
3427 uschar *slot = cd->name_table;
3428 const uschar *name; /* Don't amalgamate; some compilers */
3429 name = ++ptr; /* grumble at autoincrement in declaration */
3431 while (*ptr++ != '>');
3432 namelen = ptr - name - 1;
3434 for (i = 0; i < cd->names_found; i++)
3436 int crc = memcmp(name, slot+2, namelen);
3439 if (slot[2+namelen] == 0)
3444 crc = -1; /* Current name is substring */
3448 memmove(slot + cd->name_entry_size, slot,
3449 (cd->names_found - i) * cd->name_entry_size);
3452 slot += cd->name_entry_size;
3455 PUT2(slot, 0, *brackets + 1);
3456 memcpy(slot + 2, name, namelen);
3457 slot[2+namelen] = 0;
3459 goto NUMBERED_GROUP;
3462 if (*ptr == '=' || *ptr == '>') /* Reference or recursion */
3466 const uschar *name = ptr;
3467 uschar *slot = cd->name_table;
3469 while (*ptr != ')') ptr++;
3470 namelen = ptr - name;
3472 for (i = 0; i < cd->names_found; i++)
3474 if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
3475 slot += cd->name_entry_size;
3477 if (i >= cd->names_found)
3483 recno = GET2(slot, 0);
3485 if (type == '>') goto HANDLE_RECURSION; /* A few lines below */
3487 /* Back reference */
3491 PUT2INC(code, 0, recno);
3492 cd->backref_map |= (recno < 32)? (1 << recno) : 1;
3493 if (recno > cd->top_backref) cd->top_backref = recno;
3497 /* Should never happen */
3500 case 'R': /* Pattern recursion */
3501 ptr++; /* Same as (?0) */
3504 /* Recursion or "subroutine" call */
3506 case '0': case '1': case '2': case '3': case '4':
3507 case '5': case '6': case '7': case '8': case '9':
3509 const uschar *called;
3511 while((digitab[*ptr] & ctype_digit) != 0)
3512 recno = recno * 10 + *ptr++ - '0';
3514 /* Come here from code above that handles a named recursion */
3520 /* Find the bracket that is being referenced. Temporarily end the
3521 regex in case it doesn't exist. */
3524 called = (recno == 0)?
3525 cd->start_code : find_bracket(cd->start_code, utf8, recno);
3533 /* If the subpattern is still open, this is a recursive call. We
3534 check to see if this is a left recursion that could loop for ever,
3535 and diagnose that case. */
3537 if (GET(called, 1) == 0 && could_be_empty(called, code, bcptr, utf8))
3543 /* Insert the recursion/subroutine item */
3546 PUT(code, 1, called - cd->start_code);
3547 code += 1 + LINK_SIZE;
3551 /* Character after (? not specially recognized */
3553 default: /* Option setting */
3557 while (*ptr != ')' && *ptr != ':')
3561 case '-': optset = &unset; break;
3563 case 'i': *optset |= PCRE_CASELESS; break;
3564 case 'm': *optset |= PCRE_MULTILINE; break;
3565 case 's': *optset |= PCRE_DOTALL; break;
3566 case 'x': *optset |= PCRE_EXTENDED; break;
3567 case 'U': *optset |= PCRE_UNGREEDY; break;
3568 case 'X': *optset |= PCRE_EXTRA; break;
3572 /* Set up the changed option bits, but don't change anything yet. */
3574 newoptions = (options | set) & (~unset);
3576 /* If the options ended with ')' this is not the start of a nested
3577 group with option changes, so the options change at this level. Compile
3578 code to change the ims options if this setting actually changes any of
3579 them. We also pass the new setting back so that it can be put at the
3580 start of any following branches, and when this group ends (if we are in
3581 a group), a resetting item can be compiled.
3583 Note that if this item is right at the start of the pattern, the
3584 options will have been abstracted and made global, so there will be no
3585 change to compile. */
3589 if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
3592 *code++ = newoptions & PCRE_IMS;
3595 /* Change options at this level, and pass them back for use
3596 in subsequent branches. Reset the greedy defaults and the case
3597 value for firstbyte and reqbyte. */
3599 *optionsptr = options = newoptions;
3600 greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
3601 greedy_non_default = greedy_default ^ 1;
3602 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
3604 previous = NULL; /* This item can't be repeated */
3605 continue; /* It is complete */
3608 /* If the options ended with ':' we are heading into a nested group
3609 with possible change of options. Such groups are non-capturing and are
3610 not assertions of any kind. All we need to do is skip over the ':';
3611 the newoptions value is handled below. */
3618 /* If PCRE_NO_AUTO_CAPTURE is set, all unadorned brackets become
3619 non-capturing and behave like (?:...) brackets */
3621 else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
3626 /* Else we have a referencing group; adjust the opcode. If the bracket
3627 number is greater than EXTRACT_BASIC_MAX, we set the opcode one higher, and
3628 arrange for the true number to follow later, in an OP_BRANUMBER item. */
3633 if (++(*brackets) > EXTRACT_BASIC_MAX)
3635 bravalue = OP_BRA + EXTRACT_BASIC_MAX + 1;
3636 code[1+LINK_SIZE] = OP_BRANUMBER;
3637 PUT2(code, 2+LINK_SIZE, *brackets);
3640 else bravalue = OP_BRA + *brackets;
3643 /* Process nested bracketed re. Assertions may not be repeated, but other
3644 kinds can be. We copy code into a non-register variable in order to be able
3645 to pass its address because some compilers complain otherwise. Pass in a
3646 new setting for the ims options if they have changed. */
3648 previous = (bravalue >= OP_ONCE)? code : NULL;
3651 tempreqvary = cd->req_varyopt; /* Save value before bracket */
3654 newoptions, /* The complete new option state */
3655 options & PCRE_IMS, /* The previous ims option state */
3656 brackets, /* Extracting bracket count */
3657 &tempcode, /* Where to put code (updated) */
3658 &ptr, /* Input pointer (updated) */
3659 errorptr, /* Where to put an error message */
3660 (bravalue == OP_ASSERTBACK ||
3661 bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
3662 skipbytes, /* Skip over OP_COND/OP_BRANUMBER */
3663 &subfirstbyte, /* For possible first char */
3664 &subreqbyte, /* For possible last char */
3665 bcptr, /* Current branch chain */
3666 cd)) /* Tables block */
3669 /* At the end of compiling, code is still pointing to the start of the
3670 group, while tempcode has been updated to point past the end of the group
3671 and any option resetting that may follow it. The pattern pointer (ptr)
3672 is on the bracket. */
3674 /* If this is a conditional bracket, check that there are no more than
3675 two branches in the group. */
3677 else if (bravalue == OP_COND)
3686 while (*tc != OP_KET);
3694 /* If there is just one branch, we must not make use of its firstbyte or
3695 reqbyte, because this is equivalent to an empty second branch. */
3697 if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
3700 /* Handle updating of the required and first characters. Update for normal
3701 brackets of all kinds, and conditions with two branches (see code above).
3702 If the bracket is followed by a quantifier with zero repeat, we have to
3703 back off. Hence the definition of zeroreqbyte and zerofirstbyte outside the
3704 main loop so that they can be accessed for the back off. */
3706 zeroreqbyte = reqbyte;
3707 zerofirstbyte = firstbyte;
3708 groupsetfirstbyte = FALSE;
3710 if (bravalue >= OP_BRA || bravalue == OP_ONCE || bravalue == OP_COND)
3712 /* If we have not yet set a firstbyte in this branch, take it from the
3713 subpattern, remembering that it was set here so that a repeat of more
3714 than one can replicate it as reqbyte if necessary. If the subpattern has
3715 no firstbyte, set "none" for the whole branch. In both cases, a zero
3716 repeat forces firstbyte to "none". */
3718 if (firstbyte == REQ_UNSET)
3720 if (subfirstbyte >= 0)
3722 firstbyte = subfirstbyte;
3723 groupsetfirstbyte = TRUE;
3725 else firstbyte = REQ_NONE;
3726 zerofirstbyte = REQ_NONE;
3729 /* If firstbyte was previously set, convert the subpattern's firstbyte
3730 into reqbyte if there wasn't one, using the vary flag that was in
3731 existence beforehand. */
3733 else if (subfirstbyte >= 0 && subreqbyte < 0)
3734 subreqbyte = subfirstbyte | tempreqvary;
3736 /* If the subpattern set a required byte (or set a first byte that isn't
3737 really the first byte - see above), set it. */
3739 if (subreqbyte >= 0) reqbyte = subreqbyte;
3742 /* For a forward assertion, we take the reqbyte, if set. This can be
3743 helpful if the pattern that follows the assertion doesn't set a different
3744 char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
3745 for an assertion, however because it leads to incorrect effect for patterns
3746 such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
3747 of a firstbyte. This is overcome by a scan at the end if there's no
3748 firstbyte, looking for an asserted first char. */
3750 else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
3752 /* Now update the main code pointer to the end of the group. */
3756 /* Error if hit end of pattern */
3765 /* Check \ for being a real metacharacter; if not, fall through and handle
3766 it as a data character at the start of a string. Escape items are checked
3767 for validity in the pre-compiling pass. */
3771 c = check_escape(&ptr, errorptr, *brackets, options, FALSE);
3773 /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
3774 are arranged to be the negation of the corresponding OP_values. For the
3775 back references, the values are ESC_REF plus the reference number. Only
3776 back references and those types that consume a character may be repeated.
3777 We can test for values between ESC_b and ESC_Z for the latter; this may
3778 have to change if any new ones are ever created. */
3782 if (-c == ESC_Q) /* Handle start of quoted string */
3784 if (ptr[1] == '\\' && ptr[2] == 'E') ptr += 2; /* avoid empty string */
3789 /* For metasequences that actually match a character, we disable the
3790 setting of a first character if it hasn't already been set. */
3792 if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
3793 firstbyte = REQ_NONE;
3795 /* Set values to reset to if this is followed by a zero repeat. */
3797 zerofirstbyte = firstbyte;
3798 zeroreqbyte = reqbyte;
3800 /* Back references are handled specially */
3804 int number = -c - ESC_REF;
3807 PUT2INC(code, 0, number);
3810 /* So are Unicode property matches, if supported. We know that get_ucp
3811 won't fail because it was tested in the pre-pass. */
3814 else if (-c == ESC_P || -c == ESC_p)
3817 int value = get_ucp(&ptr, &negated, errorptr);
3819 *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
3824 /* For the rest, we can obtain the OP value by negating the escape
3829 previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
3835 /* We have a data character whose value is in c. In UTF-8 mode it may have
3836 a value > 127. We set its representation in the length/buffer, and then
3837 handle it as a data character. */
3840 if (utf8 && c > 127)
3841 mclength = ord2utf8(c, mcbuffer);
3852 /* Handle a literal character. It is guaranteed not to be whitespace or #
3853 when the extended flag is set. If we are in UTF-8 mode, it may be a
3854 multi-byte literal character. */
3862 if (utf8 && (c & 0xc0) == 0xc0)
3864 while ((ptr[1] & 0xc0) == 0x80)
3865 mcbuffer[mclength++] = *(++ptr);
3869 /* At this point we have the character's bytes in mcbuffer, and the length
3870 in mclength. When not in UTF-8 mode, the length is always 1. */
3874 *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARNC : OP_CHAR;
3875 for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
3877 /* Set the first and required bytes appropriately. If no previous first
3878 byte, set it from this character, but revert to none on a zero repeat.
3879 Otherwise, leave the firstbyte value alone, and don't change it on a zero
3882 if (firstbyte == REQ_UNSET)
3884 zerofirstbyte = REQ_NONE;
3885 zeroreqbyte = reqbyte;
3887 /* If the character is more than one byte long, we can set firstbyte
3888 only if it is not to be matched caselessly. */
3890 if (mclength == 1 || req_caseopt == 0)
3892 firstbyte = mcbuffer[0] | req_caseopt;
3893 if (mclength != 1) reqbyte = code[-1] | cd->req_varyopt;
3895 else firstbyte = reqbyte = REQ_NONE;
3898 /* firstbyte was previously set; we can set reqbyte only if the length is
3899 1 or the matching is caseful. */
3903 zerofirstbyte = firstbyte;
3904 zeroreqbyte = reqbyte;
3905 if (mclength == 1 || req_caseopt == 0)
3906 reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
3909 break; /* End of literal character handling */
3911 } /* end of big loop */
3913 /* Control never reaches here by falling through, only by a goto for all the
3914 error states. Pass back the position in the pattern so that it can be displayed
3915 to the user for diagnosing the error. */
3925 /*************************************************
3926 * Compile sequence of alternatives *
3927 *************************************************/
3929 /* On entry, ptr is pointing past the bracket character, but on return
3930 it points to the closing bracket, or vertical bar, or end of string.
3931 The code variable is pointing at the byte into which the BRA operator has been
3932 stored. If the ims options are changed at the start (for a (?ims: group) or
3933 during any branch, we need to insert an OP_OPT item at the start of every
3934 following branch to ensure they get set correctly at run time, and also pass
3935 the new options into every subsequent branch compile.
3938 options option bits, including any changes for this subpattern
3939 oldims previous settings of ims option bits
3940 brackets -> int containing the number of extracting brackets used
3941 codeptr -> the address of the current code pointer
3942 ptrptr -> the address of the current pattern pointer
3943 errorptr -> pointer to error message
3944 lookbehind TRUE if this is a lookbehind assertion
3945 skipbytes skip this many bytes at start (for OP_COND, OP_BRANUMBER)
3946 firstbyteptr place to put the first required character, or a negative number
3947 reqbyteptr place to put the last required character, or a negative number
3948 bcptr pointer to the chain of currently open branches
3949 cd points to the data block with tables pointers etc.
3951 Returns: TRUE on success
3955 compile_regex(int options, int oldims, int *brackets, uschar **codeptr,
3956 const uschar **ptrptr, const char **errorptr, BOOL lookbehind, int skipbytes,
3957 int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd)
3959 const uschar *ptr = *ptrptr;
3960 uschar *code = *codeptr;
3961 uschar *last_branch = code;
3962 uschar *start_bracket = code;
3963 uschar *reverse_count = NULL;
3964 int firstbyte, reqbyte;
3965 int branchfirstbyte, branchreqbyte;
3971 firstbyte = reqbyte = REQ_UNSET;
3973 /* Offset is set zero to mark that this bracket is still open */
3976 code += 1 + LINK_SIZE + skipbytes;
3978 /* Loop for each alternative branch */
3982 /* Handle a change of ims options at the start of the branch */
3984 if ((options & PCRE_IMS) != oldims)
3987 *code++ = options & PCRE_IMS;
3990 /* Set up dummy OP_REVERSE if lookbehind assertion */
3994 *code++ = OP_REVERSE;
3995 reverse_count = code;
3999 /* Now compile the branch */
4001 if (!compile_branch(&options, brackets, &code, &ptr, errorptr,
4002 &branchfirstbyte, &branchreqbyte, &bc, cd))
4008 /* If this is the first branch, the firstbyte and reqbyte values for the
4009 branch become the values for the regex. */
4011 if (*last_branch != OP_ALT)
4013 firstbyte = branchfirstbyte;
4014 reqbyte = branchreqbyte;
4017 /* If this is not the first branch, the first char and reqbyte have to
4018 match the values from all the previous branches, except that if the previous
4019 value for reqbyte didn't have REQ_VARY set, it can still match, and we set
4020 REQ_VARY for the regex. */
4024 /* If we previously had a firstbyte, but it doesn't match the new branch,
4025 we have to abandon the firstbyte for the regex, but if there was previously
4026 no reqbyte, it takes on the value of the old firstbyte. */
4028 if (firstbyte >= 0 && firstbyte != branchfirstbyte)
4030 if (reqbyte < 0) reqbyte = firstbyte;
4031 firstbyte = REQ_NONE;
4034 /* If we (now or from before) have no firstbyte, a firstbyte from the
4035 branch becomes a reqbyte if there isn't a branch reqbyte. */
4037 if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
4038 branchreqbyte = branchfirstbyte;
4040 /* Now ensure that the reqbytes match */
4042 if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
4044 else reqbyte |= branchreqbyte; /* To "or" REQ_VARY */
4047 /* If lookbehind, check that this branch matches a fixed-length string,
4048 and put the length into the OP_REVERSE item. Temporarily mark the end of
4049 the branch with OP_END. */
4055 length = find_fixedlength(last_branch, options);
4056 DPRINTF(("fixed length = %d\n", length));
4059 *errorptr = (length == -2)? ERR36 : ERR25;
4063 PUT(reverse_count, 0, length);
4066 /* Reached end of expression, either ')' or end of pattern. Go back through
4067 the alternative branches and reverse the chain of offsets, with the field in
4068 the BRA item now becoming an offset to the first alternative. If there are
4069 no alternatives, it points to the end of the group. The length in the
4070 terminating ket is always the length of the whole bracketed item. If any of
4071 the ims options were changed inside the group, compile a resetting op-code
4072 following, except at the very end of the pattern. Return leaving the pointer
4073 at the terminating char. */
4077 int length = code - last_branch;
4080 int prev_length = GET(last_branch, 1);
4081 PUT(last_branch, 1, length);
4082 length = prev_length;
4083 last_branch -= length;
4087 /* Fill in the ket */
4090 PUT(code, 1, code - start_bracket);
4091 code += 1 + LINK_SIZE;
4093 /* Resetting option if needed */
4095 if ((options & PCRE_IMS) != oldims && *ptr == ')')
4101 /* Set values to pass back */
4105 *firstbyteptr = firstbyte;
4106 *reqbyteptr = reqbyte;
4110 /* Another branch follows; insert an "or" node. Its length field points back
4111 to the previous branch while the bracket remains open. At the end the chain
4112 is reversed. It's done like this so that the start of the bracket has a
4113 zero offset until it is closed, making it possible to detect recursion. */
4116 PUT(code, 1, code - last_branch);
4117 bc.current = last_branch = code;
4118 code += 1 + LINK_SIZE;
4121 /* Control never reaches here */
4127 /*************************************************
4128 * Check for anchored expression *
4129 *************************************************/
4131 /* Try to find out if this is an anchored regular expression. Consider each
4132 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
4133 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
4134 it's anchored. However, if this is a multiline pattern, then only OP_SOD
4135 counts, since OP_CIRC can match in the middle.
4137 We can also consider a regex to be anchored if OP_SOM starts all its branches.
4138 This is the code for \G, which means "match at start of match position, taking
4139 into account the match offset".
4141 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
4142 because that will try the rest of the pattern at all possible matching points,
4143 so there is no point trying again.... er ....
4145 .... except when the .* appears inside capturing parentheses, and there is a
4146 subsequent back reference to those parentheses. We haven't enough information
4147 to catch that case precisely.
4149 At first, the best we could do was to detect when .* was in capturing brackets
4150 and the highest back reference was greater than or equal to that level.
4151 However, by keeping a bitmap of the first 31 back references, we can catch some
4152 of the more common cases more precisely.
4155 code points to start of expression (the bracket)
4156 options points to the options setting
4157 bracket_map a bitmap of which brackets we are inside while testing; this
4158 handles up to substring 31; after that we just have to take
4159 the less precise approach
4160 backref_map the back reference bitmap
4162 Returns: TRUE or FALSE
4166 is_anchored(register const uschar *code, int *options, unsigned int bracket_map,
4167 unsigned int backref_map)
4170 const uschar *scode =
4171 first_significant_code(code + 1+LINK_SIZE, options, PCRE_MULTILINE, FALSE);
4172 register int op = *scode;
4174 /* Capturing brackets */
4180 if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE);
4181 new_map = bracket_map | ((op < 32)? (1 << op) : 1);
4182 if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
4185 /* Other brackets */
4187 else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
4189 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
4192 /* .* is not anchored unless DOTALL is set and it isn't in brackets that
4193 are or may be referenced. */
4195 else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR) &&
4196 (*options & PCRE_DOTALL) != 0)
4198 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
4201 /* Check for explicit anchoring */
4203 else if (op != OP_SOD && op != OP_SOM &&
4204 ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
4206 code += GET(code, 1);
4208 while (*code == OP_ALT); /* Loop for each alternative */
4214 /*************************************************
4215 * Check for starting with ^ or .* *
4216 *************************************************/
4218 /* This is called to find out if every branch starts with ^ or .* so that
4219 "first char" processing can be done to speed things up in multiline
4220 matching and for non-DOTALL patterns that start with .* (which must start at
4221 the beginning or after \n). As in the case of is_anchored() (see above), we
4222 have to take account of back references to capturing brackets that contain .*
4223 because in that case we can't make the assumption.
4226 code points to start of expression (the bracket)
4227 bracket_map a bitmap of which brackets we are inside while testing; this
4228 handles up to substring 31; after that we just have to take
4229 the less precise approach
4230 backref_map the back reference bitmap
4232 Returns: TRUE or FALSE
4236 is_startline(const uschar *code, unsigned int bracket_map,
4237 unsigned int backref_map)
4240 const uschar *scode = first_significant_code(code + 1+LINK_SIZE, NULL, 0,
4242 register int op = *scode;
4244 /* Capturing brackets */
4250 if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE);
4251 new_map = bracket_map | ((op < 32)? (1 << op) : 1);
4252 if (!is_startline(scode, new_map, backref_map)) return FALSE;
4255 /* Other brackets */
4257 else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
4258 { if (!is_startline(scode, bracket_map, backref_map)) return FALSE; }
4260 /* .* means "start at start or after \n" if it isn't in brackets that
4261 may be referenced. */
4263 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
4265 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
4268 /* Check for explicit circumflex */
4270 else if (op != OP_CIRC) return FALSE;
4272 /* Move on to the next alternative */
4274 code += GET(code, 1);
4276 while (*code == OP_ALT); /* Loop for each alternative */
4282 /*************************************************
4283 * Check for asserted fixed first char *
4284 *************************************************/
4286 /* During compilation, the "first char" settings from forward assertions are
4287 discarded, because they can cause conflicts with actual literals that follow.
4288 However, if we end up without a first char setting for an unanchored pattern,
4289 it is worth scanning the regex to see if there is an initial asserted first
4290 char. If all branches start with the same asserted char, or with a bracket all
4291 of whose alternatives start with the same asserted char (recurse ad lib), then
4292 we return that char, otherwise -1.
4295 code points to start of expression (the bracket)
4296 options pointer to the options (used to check casing changes)
4297 inassert TRUE if in an assertion
4299 Returns: -1 or the fixed first char
4303 find_firstassertedchar(const uschar *code, int *options, BOOL inassert)
4305 register int c = -1;
4308 const uschar *scode =
4309 first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);
4310 register int op = *scode;
4312 if (op >= OP_BRA) op = OP_BRA;
4323 if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)
4325 if (c < 0) c = d; else if (c != d) return -1;
4328 case OP_EXACT: /* Fall through */
4335 if (!inassert) return -1;
4339 if ((*options & PCRE_CASELESS) != 0) c |= REQ_CASELESS;
4341 else if (c != scode[1]) return -1;
4345 code += GET(code, 1);
4347 while (*code == OP_ALT);
4355 /*************************************************
4356 * Validate a UTF-8 string *
4357 *************************************************/
4359 /* This function is called (optionally) at the start of compile or match, to
4360 validate that a supposed UTF-8 string is actually valid. The early check means
4361 that subsequent code can assume it is dealing with a valid string. The check
4362 can be turned off for maximum performance, but the consequences of supplying
4363 an invalid string are then undefined.
4366 string points to the string
4367 length length of string, or -1 if the string is zero-terminated
4369 Returns: < 0 if the string is a valid UTF-8 string
4370 >= 0 otherwise; the value is the offset of the bad byte
4374 valid_utf8(const uschar *string, int length)
4376 register const uschar *p;
4380 for (p = string; *p != 0; p++);
4381 length = p - string;
4384 for (p = string; length-- > 0; p++)
4387 register int c = *p;
4388 if (c < 128) continue;
4389 if ((c & 0xc0) != 0xc0) return p - string;
4390 ab = utf8_table4[c & 0x3f]; /* Number of additional bytes */
4391 if (length < ab) return p - string;
4394 /* Check top bits in the second byte */
4395 if ((*(++p) & 0xc0) != 0x80) return p - string;
4397 /* Check for overlong sequences for each different length */
4400 /* Check for xx00 000x */
4402 if ((c & 0x3e) == 0) return p - string;
4403 continue; /* We know there aren't any more bytes to check */
4405 /* Check for 1110 0000, xx0x xxxx */
4407 if (c == 0xe0 && (*p & 0x20) == 0) return p - string;
4410 /* Check for 1111 0000, xx00 xxxx */
4412 if (c == 0xf0 && (*p & 0x30) == 0) return p - string;
4415 /* Check for 1111 1000, xx00 0xxx */
4417 if (c == 0xf8 && (*p & 0x38) == 0) return p - string;
4420 /* Check for leading 0xfe or 0xff, and then for 1111 1100, xx00 00xx */
4422 if (c == 0xfe || c == 0xff ||
4423 (c == 0xfc && (*p & 0x3c) == 0)) return p - string;
4427 /* Check for valid bytes after the 2nd, if any; all must start 10 */
4430 if ((*(++p) & 0xc0) != 0x80) return p - string;
4440 /*************************************************
4441 * Compile a Regular Expression *
4442 *************************************************/
4444 /* This function takes a string and returns a pointer to a block of store
4445 holding a compiled version of the expression.
4448 pattern the regular expression
4449 options various option bits
4450 errorptr pointer to pointer to error text
4451 erroroffset ptr offset in pattern where error was detected
4452 tables pointer to character tables or NULL
4454 Returns: pointer to compiled data block, or NULL on error,
4455 with errorptr and erroroffset set
4459 pcre_compile(const char *pattern, int options, const char **errorptr,
4460 int *erroroffset, const unsigned char *tables)
4463 int length = 1 + LINK_SIZE; /* For initial BRA plus length */
4464 int c, firstbyte, reqbyte;
4466 int branch_extra = 0;
4467 int branch_newextra;
4468 int item_count = -1;
4470 int max_name_size = 0;
4471 int lastitemlength = 0;
4476 BOOL inescq = FALSE;
4477 unsigned int brastackptr = 0;
4480 const uschar *codestart;
4482 compile_data compile_block;
4483 int brastack[BRASTACK_SIZE];
4484 uschar bralenstack[BRASTACK_SIZE];
4486 /* We can't pass back an error message if errorptr is NULL; I guess the best we
4487 can do is just return NULL. */
4489 if (errorptr == NULL) return NULL;
4492 /* However, we can give a message for this error */
4494 if (erroroffset == NULL)
4501 /* Can't support UTF8 unless PCRE has been compiled to include the code. */
4504 utf8 = (options & PCRE_UTF8) != 0;
4505 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
4506 (*erroroffset = valid_utf8((uschar *)pattern, -1)) >= 0)
4512 if ((options & PCRE_UTF8) != 0)
4519 if ((options & ~PUBLIC_OPTIONS) != 0)
4525 /* Set up pointers to the individual character tables */
4527 if (tables == NULL) tables = pcre_default_tables;
4528 compile_block.lcc = tables + lcc_offset;
4529 compile_block.fcc = tables + fcc_offset;
4530 compile_block.cbits = tables + cbits_offset;
4531 compile_block.ctypes = tables + ctypes_offset;
4533 /* Maximum back reference and backref bitmap. This is updated for numeric
4534 references during the first pass, but for named references during the actual
4535 compile pass. The bitmap records up to 31 back references to help in deciding
4536 whether (.*) can be treated as anchored or not. */
4538 compile_block.top_backref = 0;
4539 compile_block.backref_map = 0;
4541 /* Reflect pattern for debugging output */
4543 DPRINTF(("------------------------------------------------------------------\n"));
4544 DPRINTF(("%s\n", pattern));
4546 /* The first thing to do is to make a pass over the pattern to compute the
4547 amount of store required to hold the compiled code. This does not have to be
4548 perfect as long as errors are overestimates. At the same time we can detect any
4549 flag settings right at the start, and extract them. Make an attempt to correct
4550 for any counted white space if an "extended" flag setting appears late in the
4551 pattern. We can't be so clever for #-comments. */
4553 ptr = (const uschar *)(pattern - 1);
4554 while ((c = *(++ptr)) != 0)
4561 /* If we are inside a \Q...\E sequence, all chars are literal */
4565 if ((options & PCRE_AUTO_CALLOUT) != 0) length += 2 + 2*LINK_SIZE;
4569 /* Otherwise, first check for ignored whitespace and comments */
4571 if ((options & PCRE_EXTENDED) != 0)
4573 if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
4576 /* The space before the ; is to avoid a warning on a silly compiler
4577 on the Macintosh. */
4578 while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
4584 item_count++; /* Is zero for the first non-comment item */
4586 /* Allow space for auto callout before every item except quantifiers. */
4588 if ((options & PCRE_AUTO_CALLOUT) != 0 &&
4589 c != '*' && c != '+' && c != '?' &&
4590 (c != '{' || !is_counted_repeat(ptr + 1)))
4591 length += 2 + 2*LINK_SIZE;
4595 /* A backslashed item may be an escaped data character or it may be a
4599 c = check_escape(&ptr, errorptr, bracount, options, FALSE);
4600 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
4602 lastitemlength = 1; /* Default length of last item for repeats */
4604 if (c >= 0) /* Data character */
4606 length += 2; /* For a one-byte character */
4609 if (utf8 && c > 127)
4612 for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
4613 if (c <= utf8_table1[i]) break;
4615 lastitemlength += i;
4622 /* If \Q, enter "literal" mode */
4630 /* \X is supported only if Unicode property support is compiled */
4636 goto PCRE_ERROR_RETURN;
4640 /* \P and \p are for Unicode properties, but only when the support has
4641 been compiled. Each item needs 2 bytes. */
4643 else if (-c == ESC_P || -c == ESC_p)
4649 if (get_ucp(&ptr, &negated, errorptr) < 0) goto PCRE_ERROR_RETURN;
4653 goto PCRE_ERROR_RETURN;
4657 /* Other escapes need one byte */
4661 /* A back reference needs an additional 2 bytes, plus either one or 5
4662 bytes for a repeat. We also need to keep the value of the highest
4667 int refnum = -c - ESC_REF;
4668 compile_block.backref_map |= (refnum < 32)? (1 << refnum) : 1;
4669 if (refnum > compile_block.top_backref)
4670 compile_block.top_backref = refnum;
4671 length += 2; /* For single back reference */
4672 if (ptr[1] == '{' && is_counted_repeat(ptr+2))
4674 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
4675 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
4676 if ((min == 0 && (max == 1 || max == -1)) ||
4677 (min == 1 && max == -1))
4680 if (ptr[1] == '?') ptr++;
4685 case '^': /* Single-byte metacharacters */
4692 case '*': /* These repeats won't be after brackets; */
4693 case '+': /* those are handled separately */
4696 goto POSESSIVE; /* A few lines below */
4698 /* This covers the cases of braced repeats after a single char, metachar,
4699 class, or back reference. */
4702 if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;
4703 ptr = read_repeat_counts(ptr+1, &min, &max, errorptr);
4704 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
4706 /* These special cases just insert one extra opcode */
4708 if ((min == 0 && (max == 1 || max == -1)) ||
4709 (min == 1 && max == -1))
4712 /* These cases might insert additional copies of a preceding character. */
4718 length -= lastitemlength; /* Uncount the original char or metachar */
4719 if (min > 0) length += 3 + lastitemlength;
4721 length += lastitemlength + ((max > 0)? 3 : 1);
4724 if (ptr[1] == '?') ptr++; /* Needs no extra length */
4726 POSESSIVE: /* Test for possessive quantifier */
4730 length += 2 + 2*LINK_SIZE; /* Allow for atomic brackets */
4734 /* An alternation contains an offset to the next branch or ket. If any ims
4735 options changed in the previous branch(es), and/or if we are in a
4736 lookbehind assertion, extra space will be needed at the start of the
4737 branch. This is handled by branch_extra. */
4740 length += 1 + LINK_SIZE + branch_extra;
4743 /* A character class uses 33 characters provided that all the character
4744 values are less than 256. Otherwise, it uses a bit map for low valued
4745 characters, and individual items for others. Don't worry about character
4746 types that aren't allowed in classes - they'll get picked up during the
4747 compile. A character class that contains only one single-byte character
4748 uses 2 or 3 bytes, depending on whether it is negated or not. Notice this
4749 where we can. (In UTF-8 mode we can do this only for chars < 128.) */
4752 if (*(++ptr) == '^')
4754 class_optcount = 10; /* Greater than one */
4757 else class_optcount = 0;
4763 /* Written as a "do" so that an initial ']' is taken as data */
4767 /* Inside \Q...\E everything is literal except \E */
4771 if (*ptr != '\\' || ptr[1] != 'E') goto GET_ONE_CHARACTER;
4777 /* Outside \Q...\E, check for escapes */
4781 c = check_escape(&ptr, errorptr, bracount, options, TRUE);
4782 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
4784 /* \b is backspace inside a class; \X is literal */
4786 if (-c == ESC_b) c = '\b';
4787 else if (-c == ESC_X) c = 'X';
4789 /* \Q enters quoting mode */
4791 else if (-c == ESC_Q)
4797 /* Handle escapes that turn into characters */
4799 if (c >= 0) goto NON_SPECIAL_CHARACTER;
4801 /* Escapes that are meta-things. The normal ones just affect the
4802 bit map, but Unicode properties require an XCLASS extended item. */
4806 class_optcount = 10; /* \d, \s etc; make sure > 1 */
4808 if (-c == ESC_p || -c == ESC_P)
4813 length += LINK_SIZE + 2;
4821 /* Check the syntax for POSIX stuff. The bits we actually handle are
4822 checked during the real compile phase. */
4824 else if (*ptr == '[' && check_posix_syntax(ptr, &ptr, &compile_block))
4827 class_optcount = 10; /* Make sure > 1 */
4830 /* Anything else increments the possible optimization count. We have to
4831 detect ranges here so that we can compute the number of extra ranges for
4832 caseless wide characters when UCP support is available. If there are wide
4833 characters, we are going to have to use an XCLASS, even for single
4846 GETCHARLEN(c, ptr, extra);
4854 /* Come here from handling \ above when it escapes to a char value */
4856 NON_SPECIAL_CHARACTER:
4862 uschar const *hyptr = ptr++;
4866 d = check_escape(&ptr, errorptr, bracount, options, TRUE);
4867 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
4868 if (-d == ESC_b) d = '\b'; /* backspace */
4869 else if (-d == ESC_X) d = 'X'; /* literal X in a class */
4871 else if (ptr[1] != 0 && ptr[1] != ']')
4878 GETCHARLEN(d, ptr, extra);
4885 if (d < 0) ptr = hyptr; /* go back to hyphen as data */
4888 /* If d >= 0 we have a range. In UTF-8 mode, if the end is > 255, or >
4889 127 for caseless matching, we will need to use an XCLASS. */
4893 class_optcount = 10; /* Ensure > 1 */
4897 goto PCRE_ERROR_RETURN;
4901 if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
4904 if (!class_utf8) /* Allow for XCLASS overhead */
4907 length += LINK_SIZE + 2;
4911 /* If we have UCP support, find out how many extra ranges are
4912 needed to map the other case of characters within this range. We
4913 have to mimic the range optimization here, because extending the
4914 range upwards might push d over a boundary that makes it use
4915 another byte in the UTF-8 representation. */
4917 if ((options & PCRE_CASELESS) != 0)
4922 while (get_othercase_range(&cc, origd, &occ, &ocd))
4924 if (occ >= c && ocd <= d) continue; /* Skip embedded */
4926 if (occ < c && ocd >= c - 1) /* Extend the basic range */
4927 { /* if there is overlap, */
4928 c = occ; /* noting that if occ < c */
4929 continue; /* we can't have ocd > d */
4930 } /* because a subrange is */
4931 if (ocd > d && occ <= d + 1) /* always shorter than */
4932 { /* the basic range. */
4937 /* An extra item is needed */
4939 length += 1 + ord2utf8(occ, buffer) +
4940 ((occ == ocd)? 0 : ord2utf8(ocd, buffer));
4943 #endif /* SUPPORT_UCP */
4945 /* The length of the (possibly extended) range */
4947 length += 1 + ord2utf8(c, buffer) + ord2utf8(d, buffer);
4949 #endif /* SUPPORT_UTF8 */
4953 /* We have a single character. There is nothing to be done unless we
4954 are in UTF-8 mode. If the char is > 255, or > 127 when caseless, we must
4955 allow for an XCL_SINGLE item, doubled for caselessness if there is UCP
4961 if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
4964 class_optcount = 10; /* Ensure > 1 */
4965 if (!class_utf8) /* Allow for XCLASS overhead */
4968 length += LINK_SIZE + 2;
4971 length += (((options & PCRE_CASELESS) != 0)? 2 : 1) *
4972 (1 + ord2utf8(c, buffer));
4973 #else /* SUPPORT_UCP */
4974 length += 1 + ord2utf8(c, buffer);
4975 #endif /* SUPPORT_UCP */
4977 #endif /* SUPPORT_UTF8 */
4981 while (*(++ptr) != 0 && (inescq || *ptr != ']')); /* Concludes "do" above */
4983 if (*ptr == 0) /* Missing terminating ']' */
4986 goto PCRE_ERROR_RETURN;
4989 /* We can optimize when there was only one optimizable character. Repeats
4990 for positive and negated single one-byte chars are handled by the general
4991 code. Here, we handle repeats for the class opcodes. */
4993 if (class_optcount == 1) length += 3; else
4997 /* A repeat needs either 1 or 5 bytes. If it is a possessive quantifier,
4998 we also need extra for wrapping the whole thing in a sub-pattern. */
5000 if (*ptr != 0 && ptr[1] == '{' && is_counted_repeat(ptr+2))
5002 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
5003 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
5004 if ((min == 0 && (max == 1 || max == -1)) ||
5005 (min == 1 && max == -1))
5011 length += 2 + 2*LINK_SIZE;
5013 else if (ptr[1] == '?') ptr++;
5018 /* Brackets may be genuine groups or special things */
5021 branch_newextra = 0;
5022 bracket_length = 1 + LINK_SIZE;
5024 /* Handle special forms of bracket, which all start (? */
5033 /* Skip over comments entirely */
5036 while (*ptr != 0 && *ptr != ')') ptr++;
5040 goto PCRE_ERROR_RETURN;
5044 /* Non-referencing groups and lookaheads just move the pointer on, and
5045 then behave like a non-special bracket, except that they don't increment
5046 the count of extracting brackets. Ditto for the "once only" bracket,
5047 which is in Perl from version 5.005. */
5056 /* (?R) specifies a recursive call to the regex, which is an extension
5057 to provide the facility which can be obtained by (?p{perl-code}) in
5058 Perl 5.6. In Perl 5.8 this has become (??{perl-code}).
5060 From PCRE 4.00, items such as (?3) specify subroutine-like "calls" to
5061 the appropriate numbered brackets. This includes both recursive and
5062 non-recursive calls. (?R) is now synonymous with (?0). */
5067 case '0': case '1': case '2': case '3': case '4':
5068 case '5': case '6': case '7': case '8': case '9':
5071 while ((digitab[*(++ptr)] & ctype_digit) != 0);
5075 goto PCRE_ERROR_RETURN;
5077 length += 1 + LINK_SIZE;
5079 /* If this item is quantified, it will get wrapped inside brackets so
5080 as to use the code for quantified brackets. We jump down and use the
5081 code that handles this for real brackets. */
5083 if (ptr[1] == '+' || ptr[1] == '*' || ptr[1] == '?' || ptr[1] == '{')
5085 length += 2 + 2 * LINK_SIZE; /* to make bracketed */
5086 duplength = 5 + 3 * LINK_SIZE;
5087 goto HANDLE_QUANTIFIED_BRACKETS;
5091 /* (?C) is an extension which provides "callout" - to provide a bit of
5092 the functionality of the Perl (?{...}) feature. An optional number may
5093 follow (default is zero). */
5097 while ((digitab[*(++ptr)] & ctype_digit) != 0);
5101 goto PCRE_ERROR_RETURN;
5103 length += 2 + 2*LINK_SIZE;
5106 /* Named subpatterns are an extension copied from Python */
5112 const uschar *p; /* Don't amalgamate; some compilers */
5113 p = ++ptr; /* grumble at autoincrement in declaration */
5114 while ((compile_block.ctypes[*ptr] & ctype_word) != 0) ptr++;
5118 goto PCRE_ERROR_RETURN;
5121 if (ptr - p > max_name_size) max_name_size = (ptr - p);
5125 if (*ptr == '=' || *ptr == '>')
5127 while ((compile_block.ctypes[*(++ptr)] & ctype_word) != 0);
5131 goto PCRE_ERROR_RETURN;
5136 /* Unknown character after (?P */
5139 goto PCRE_ERROR_RETURN;
5141 /* Lookbehinds are in Perl from version 5.005 */
5145 if (*ptr == '=' || *ptr == '!')
5147 branch_newextra = 1 + LINK_SIZE;
5148 length += 1 + LINK_SIZE; /* For the first branch */
5152 goto PCRE_ERROR_RETURN;
5154 /* Conditionals are in Perl from version 5.005. The bracket must either
5155 be followed by a number (for bracket reference) or by an assertion
5156 group, or (a PCRE extension) by 'R' for a recursion test. */
5159 if (ptr[3] == 'R' && ptr[4] == ')')
5164 else if ((digitab[ptr[3]] & ctype_digit) != 0)
5168 while ((digitab[*ptr] & ctype_digit) != 0) ptr++;
5172 goto PCRE_ERROR_RETURN;
5175 else /* An assertion must follow */
5177 ptr++; /* Can treat like ':' as far as spacing is concerned */
5178 if (ptr[2] != '?' ||
5179 (ptr[3] != '=' && ptr[3] != '!' && ptr[3] != '<') )
5181 ptr += 2; /* To get right offset in message */
5183 goto PCRE_ERROR_RETURN;
5188 /* Else loop checking valid options until ) is met. Anything else is an
5189 error. If we are without any brackets, i.e. at top level, the settings
5190 act as if specified in the options, so massage the options immediately.
5191 This is for backward compatibility with Perl 5.004. */
5204 *optset |= PCRE_CASELESS;
5208 *optset |= PCRE_MULTILINE;
5212 *optset |= PCRE_DOTALL;
5216 *optset |= PCRE_EXTENDED;
5220 *optset |= PCRE_EXTRA;
5224 *optset |= PCRE_UNGREEDY;
5231 /* A termination by ')' indicates an options-setting-only item; if
5232 this is at the very start of the pattern (indicated by item_count
5233 being zero), we use it to set the global options. This is helpful
5234 when analyzing the pattern for first characters, etc. Otherwise
5235 nothing is done here and it is handled during the compiling
5238 [Historical note: Up to Perl 5.8, options settings at top level
5239 were always global settings, wherever they appeared in the pattern.
5240 That is, they were equivalent to an external setting. From 5.8
5241 onwards, they apply only to what follows (which is what you might
5245 if (item_count == 0)
5247 options = (options | set) & (~unset);
5248 set = unset = 0; /* To save length */
5249 item_count--; /* To allow for several */
5254 /* A termination by ':' indicates the start of a nested group with
5255 the given options set. This is again handled at compile time, but
5256 we must allow for compiled space if any of the ims options are
5257 set. We also have to allow for resetting space at the end of
5258 the group, which is why 4 is added to the length and not just 2.
5259 If there are several changes of options within the same group, this
5260 will lead to an over-estimate on the length, but this shouldn't
5261 matter very much. We also have to allow for resetting options at
5262 the start of any alternations, which we do by setting
5263 branch_newextra to 2. Finally, we record whether the case-dependent
5264 flag ever changes within the regex. This is used by the "required
5268 if (((set|unset) & PCRE_IMS) != 0)
5271 branch_newextra = 2;
5272 if (((set|unset) & PCRE_CASELESS) != 0) options |= PCRE_ICHANGED;
5276 /* Unrecognized option character */
5280 goto PCRE_ERROR_RETURN;
5284 /* If we hit a closing bracket, that's it - this is a freestanding
5285 option-setting. We need to ensure that branch_extra is updated if
5286 necessary. The only values branch_newextra can have here are 0 or 2.
5287 If the value is 2, then branch_extra must either be 2 or 3+LINK_SIZE, depending
5288 on whether this is a lookbehind group or not. */
5293 if (branch_newextra == 2 &&
5294 (branch_extra == 0 || branch_extra == 1+LINK_SIZE))
5295 branch_extra += branch_newextra;
5299 /* If options were terminated by ':' control comes here. Fall through
5300 to handle the group below. */
5304 /* Extracting brackets must be counted so we can process escapes in a
5305 Perlish way. If the number exceeds EXTRACT_BASIC_MAX we are going to
5306 need an additional 3 bytes of store per extracting bracket. However, if
5307 PCRE_NO_AUTO_CAPTURE is set, unadorned brackets become non-capturing, so we
5308 must leave the count alone (it will always be zero). */
5310 else if ((options & PCRE_NO_AUTO_CAPTURE) == 0)
5313 if (bracount > EXTRACT_BASIC_MAX) bracket_length += 3;
5316 /* Save length for computing whole length at end if there's a repeat that
5317 requires duplication of the group. Also save the current value of
5318 branch_extra, and start the new group with the new value. If non-zero, this
5319 will either be 2 for a (?imsx: group, or 1+LINK_SIZE for a lookbehind assertion. */
5321 if (brastackptr >= sizeof(brastack)/sizeof(int))
5324 goto PCRE_ERROR_RETURN;
5327 bralenstack[brastackptr] = branch_extra;
5328 branch_extra = branch_newextra;
5330 brastack[brastackptr++] = length;
5331 length += bracket_length;
5334 /* Handle ket. Look for subsequent max/min; for certain sets of values we
5335 have to replicate this bracket up to that many times. If brastackptr is
5336 0 this is an unmatched bracket which will generate an error, but take care
5337 not to try to access brastack[-1] when computing the length and restoring
5338 the branch_extra value. */
5341 length += 1 + LINK_SIZE;
5342 if (brastackptr > 0)
5344 duplength = length - brastack[--brastackptr];
5345 branch_extra = bralenstack[brastackptr];
5349 /* The following code is also used when a recursion such as (?3) is
5350 followed by a quantifier, because in that case, it has to be wrapped inside
5351 brackets so that the quantifier works. The value of duplength must be
5352 set before arrival. */
5354 HANDLE_QUANTIFIED_BRACKETS:
5356 /* Leave ptr at the final char; for read_repeat_counts this happens
5357 automatically; for the others we need an increment. */
5359 if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2))
5361 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
5362 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
5364 else if (c == '*') { min = 0; max = -1; ptr++; }
5365 else if (c == '+') { min = 1; max = -1; ptr++; }
5366 else if (c == '?') { min = 0; max = 1; ptr++; }
5367 else { min = 1; max = 1; }
5369 /* If the minimum is zero, we have to allow for an OP_BRAZERO before the
5370 group, and if the maximum is greater than zero, we have to replicate
5371 maxval-1 times; each replication acquires an OP_BRAZERO plus a nesting
5377 if (max > 0) length += (max - 1) * (duplength + 3 + 2*LINK_SIZE);
5380 /* When the minimum is greater than zero, we have to replicate up to
5381 minval-1 times, with no additions required in the copies. Then, if there
5382 is a limited maximum we have to replicate up to maxval-1 times allowing
5383 for a BRAZERO item before each optional copy and nesting brackets for all
5384 but one of the optional copies. */
5388 length += (min - 1) * duplength;
5389 if (max > min) /* Need this test as max=-1 means no limit */
5390 length += (max - min) * (duplength + 3 + 2*LINK_SIZE)
5391 - (2 + 2*LINK_SIZE);
5394 /* Allow space for once brackets for "possessive quantifier" */
5399 length += 2 + 2*LINK_SIZE;
5403 /* Non-special character. It won't be space or # in extended mode, so it is
5404 always a genuine character. If we are in a \Q...\E sequence, check for the
5405 end; if not, we have a literal. */
5410 if (inescq && c == '\\' && ptr[1] == 'E')
5417 length += 2; /* For a one-byte character */
5418 lastitemlength = 1; /* Default length of last item for repeats */
5420 /* In UTF-8 mode, check for additional bytes. */
5423 if (utf8 && (c & 0xc0) == 0xc0)
5425 while ((ptr[1] & 0xc0) == 0x80) /* Can't flow over the end */
5426 { /* because the end is marked */
5427 lastitemlength++; /* by a zero byte. */
5438 length += 2 + LINK_SIZE; /* For final KET and END */
5440 if ((options & PCRE_AUTO_CALLOUT) != 0)
5441 length += 2 + 2*LINK_SIZE; /* For final callout */
5443 if (length > MAX_PATTERN_SIZE)
5449 /* Compute the size of data block needed and get it, either from malloc or
5450 externally provided function. */
5452 size = length + sizeof(real_pcre) + name_count * (max_name_size + 3);
5453 re = (real_pcre *)(pcre_malloc)(size);
5461 /* Put in the magic number, and save the sizes, options, and character table
5462 pointer. NULL is used for the default character tables. The nullpad field is at
5463 the end; it's there to help in the case when a regex compiled on a system with
5464 4-byte pointers is run on another with 8-byte pointers. */
5466 re->magic_number = MAGIC_NUMBER;
5468 re->options = options;
5469 re->dummy1 = re->dummy2 = 0;
5470 re->name_table_offset = sizeof(real_pcre);
5471 re->name_entry_size = max_name_size + 3;
5472 re->name_count = name_count;
5473 re->tables = (tables == pcre_default_tables)? NULL : tables;
5476 /* The starting points of the name/number translation table and of the code are
5477 passed around in the compile data block. */
5479 compile_block.names_found = 0;
5480 compile_block.name_entry_size = max_name_size + 3;
5481 compile_block.name_table = (uschar *)re + re->name_table_offset;
5482 codestart = compile_block.name_table + re->name_entry_size * re->name_count;
5483 compile_block.start_code = codestart;
5484 compile_block.start_pattern = (const uschar *)pattern;
5485 compile_block.req_varyopt = 0;
5486 compile_block.nopartial = FALSE;
5488 /* Set up a starting, non-extracting bracket, then compile the expression. On
5489 error, *errorptr will be set non-NULL, so we don't need to look at the result
5490 of the function here. */
5492 ptr = (const uschar *)pattern;
5493 code = (uschar *)codestart;
5496 (void)compile_regex(options, options & PCRE_IMS, &bracount, &code, &ptr,
5497 errorptr, FALSE, 0, &firstbyte, &reqbyte, NULL, &compile_block);
5498 re->top_bracket = bracount;
5499 re->top_backref = compile_block.top_backref;
5501 if (compile_block.nopartial) re->options |= PCRE_NOPARTIAL;
5503 /* If not reached end of pattern on success, there's an excess bracket. */
5505 if (*errorptr == NULL && *ptr != 0) *errorptr = ERR22;
5507 /* Fill in the terminating state and check for disastrous overflow, but
5508 if debugging, leave the test till after things are printed out. */
5513 if (code - codestart > length) *errorptr = ERR23;
5516 /* Give an error if there's a back reference to a non-existent capturing
5519 if (re->top_backref > re->top_bracket) *errorptr = ERR15;
5521 /* Failed to compile, or error while post-processing */
5523 if (*errorptr != NULL)
5527 *erroroffset = ptr - (const uschar *)pattern;
5531 /* If the anchored option was not passed, set the flag if we can determine that
5532 the pattern is anchored by virtue of ^ characters or \A or anything else (such
5533 as starting with .* when DOTALL is set).
5535 Otherwise, if we know what the first character has to be, save it, because that
5536 speeds up unanchored matches no end. If not, see if we can set the
5537 PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
5538 start with ^, and also when all branches start with .* for non-DOTALL matches.
5541 if ((options & PCRE_ANCHORED) == 0)
5543 int temp_options = options;
5544 if (is_anchored(codestart, &temp_options, 0, compile_block.backref_map))
5545 re->options |= PCRE_ANCHORED;
5549 firstbyte = find_firstassertedchar(codestart, &temp_options, FALSE);
5550 if (firstbyte >= 0) /* Remove caseless flag for non-caseable chars */
5552 int ch = firstbyte & 255;
5553 re->first_byte = ((firstbyte & REQ_CASELESS) != 0 &&
5554 compile_block.fcc[ch] == ch)? ch : firstbyte;
5555 re->options |= PCRE_FIRSTSET;
5557 else if (is_startline(codestart, 0, compile_block.backref_map))
5558 re->options |= PCRE_STARTLINE;
5562 /* For an anchored pattern, we use the "required byte" only if it follows a
5563 variable length item in the regex. Remove the caseless flag for non-caseable
5567 ((re->options & PCRE_ANCHORED) == 0 || (reqbyte & REQ_VARY) != 0))
5569 int ch = reqbyte & 255;
5570 re->req_byte = ((reqbyte & REQ_CASELESS) != 0 &&
5571 compile_block.fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte;
5572 re->options |= PCRE_REQCHSET;
5575 /* Print out the compiled data for debugging */
5579 printf("Length = %d top_bracket = %d top_backref = %d\n",
5580 length, re->top_bracket, re->top_backref);
5582 if (re->options != 0)
5584 printf("%s%s%s%s%s%s%s%s%s%s\n",
5585 ((re->options & PCRE_NOPARTIAL) != 0)? "nopartial " : "",
5586 ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
5587 ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
5588 ((re->options & PCRE_ICHANGED) != 0)? "case state changed " : "",
5589 ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
5590 ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
5591 ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
5592 ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
5593 ((re->options & PCRE_EXTRA) != 0)? "extra " : "",
5594 ((re->options & PCRE_UNGREEDY) != 0)? "ungreedy " : "");
5597 if ((re->options & PCRE_FIRSTSET) != 0)
5599 int ch = re->first_byte & 255;
5600 const char *caseless = ((re->first_byte & REQ_CASELESS) == 0)? "" : " (caseless)";
5601 if (isprint(ch)) printf("First char = %c%s\n", ch, caseless);
5602 else printf("First char = \\x%02x%s\n", ch, caseless);
5605 if ((re->options & PCRE_REQCHSET) != 0)
5607 int ch = re->req_byte & 255;
5608 const char *caseless = ((re->req_byte & REQ_CASELESS) == 0)? "" : " (caseless)";
5609 if (isprint(ch)) printf("Req char = %c%s\n", ch, caseless);
5610 else printf("Req char = \\x%02x%s\n", ch, caseless);
5613 print_internals(re, stdout);
5615 /* This check is done here in the debugging case so that the code that
5616 was compiled can be seen. */
5618 if (code - codestart > length)
5622 *erroroffset = ptr - (uschar *)pattern;
5632 /*************************************************
5633 * Match a back-reference *
5634 *************************************************/
5636 /* If a back reference hasn't been set, the length that is passed is greater
5637 than the number of characters left in the string, so the match fails.
5640 offset index into the offset vector
5641 eptr points into the subject
5642 length length to be matched
5643 md points to match data block
5646 Returns: TRUE if matched
/* match_ref: compare length bytes of the subject at eptr with the bytes that
start md->offset_vector[offset] bytes into the subject (md->start_subject).
FALSE is returned as soon as a difference is found, or immediately when fewer
than length bytes remain before md->end_subject.
NOTE(review): this listing omits some short source lines (return type, braces,
preprocessor guards, the final return), so the statements below are an excerpt
of the body rather than all of it. */
5650 match_ref(int offset, register const uschar *eptr, int length, match_data *md,
5651 unsigned long int ims)
/* p is the start of the text being referred back to. */
5653 const uschar *p = md->start_subject + md->offset_vector[offset];
/* Trace output showing both strings. NOTE(review): presumably compiled only in
debugging builds - the guard is not visible in this listing; confirm. */
5656 if (eptr >= md->end_subject)
5657 printf("matching subject <null>");
5660 printf("matching subject ");
5661 pchars(eptr, length, TRUE, md);
5663 printf(" against backref ");
5664 pchars(p, length, FALSE, md);
5668 /* Always fail if not enough characters left */
5670 if (length > md->end_subject - eptr) return FALSE;
5672 /* Separate the caseless case for speed: when PCRE_CASELESS is set in ims,
both bytes are mapped through md->lcc before being compared; otherwise they are
compared directly. */
5674 if ((ims & PCRE_CASELESS) != 0)
5676 while (length-- > 0)
5677 if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE;
5680 { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
5687 /*************************************************
5688 * Match character against an XCLASS *
5689 *************************************************/
5691 /* This function is called from within the XCLASS code below, to match a
5692 character against an extended class which might match values > 255.
5696 data points to the flag byte of the XCLASS data
5698 Returns: TRUE if character matches, else FALSE
/* match_xclass: decide whether character value c belongs to the extended class
described by data. The byte at *data is a flag byte: XCL_NOT marks the class as
negated and XCL_MAP indicates that a 32-byte bitmap follows it. Every hit
returns !negated; running off the end of the item list returns negated.
NOTE(review): this listing omits some short source lines (return type, braces,
some declarations and tests), so the statements below are an excerpt of the
body rather than all of it. */
5702 match_xclass(int c, const uschar *data)
5705 BOOL negated = (*data & XCL_NOT) != 0;
5707 /* Character values < 256 are matched against a bitmap, if one is present. If
5708 not, we still carry on, because there may be ranges that start below 256 in the
5713 if ((*data & XCL_MAP) != 0 && (data[1 + c/8] & (1 << (c&7))) != 0)
5714 return !negated; /* char found */
5717 /* First skip the bit map if present. Then match against the list of Unicode
5718 properties or large chars or ranges that end with a large char. We won't ever
5719 encounter XCL_PROP or XCL_NOTPROP when UCP support is not compiled. */
/* The post-increment steps over the flag byte; the extra 32 steps over the
bitmap when one is present. */
5721 if ((*data++ & XCL_MAP) != 0) data += 32;
/* Each item starts with a type byte t; the list ends at XCL_END. */
5723 while ((t = *data++) != XCL_END)
/* XCL_SINGLE: one character value follows; match on equality. */
5726 if (t == XCL_SINGLE)
5728 GETCHARINC(x, data);
5729 if (c == x) return !negated;
/* XCL_RANGE: two character values follow; both ends are inclusive. */
5731 else if (t == XCL_RANGE)
5733 GETCHARINC(x, data);
5734 GETCHARINC(y, data);
5735 if (c >= x && c <= y) return !negated;
5739 else /* XCL_PROP & XCL_NOTPROP */
5741 int chartype, othercase;
5742 int rqdtype = *data++;
5743 int category = ucp_findchar(c, &chartype, &othercase);
/* Each test below succeeds when the outcome of the property comparison agrees
with the item type: equal for XCL_PROP, unequal for XCL_NOTPROP. The first
compares rqdtype - 128 with the category returned by ucp_findchar(), the second
compares rqdtype with the character type. NOTE(review): the condition choosing
between the two is not visible in this listing - presumably rqdtype >= 128;
confirm. */
5746 if ((rqdtype - 128 == category) == (t == XCL_PROP)) return !negated;
5750 if ((rqdtype == chartype) == (t == XCL_PROP)) return !negated;
5753 #endif /* SUPPORT_UCP */
5756 return negated; /* char did not match */
5761 /***************************************************************************
5762 ****************************************************************************
5763 RECURSION IN THE match() FUNCTION
5765 The match() function is highly recursive. Some regular expressions can cause
5766 it to recurse thousands of times. I was writing for Unix, so I just let it
5767 call itself recursively. This uses the stack for saving everything that has
5768 to be saved for a recursive call. On Unix, the stack can be large, and this
5771 It turns out that on non-Unix systems there are problems with programs that
5772 use a lot of stack. (This despite the fact that every last chip has oodles
5773 of memory these days, and techniques for extending the stack have been known
5774 for decades.) So....
5776 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
5777 calls by keeping local variables that need to be preserved in blocks of memory
5778 obtained from malloc instead of on the stack. Macros are used to
5779 achieve this so that the actual code doesn't look very different to what it
5781 ****************************************************************************
5782 ***************************************************************************/
5785 /* These versions of the macros use the stack, as normal: RMATCH is a direct
call of match() whose result is assigned to rx, and RRETURN is a plain return.
NOTE(review): the preprocessor conditionals that choose between this set and
the heap-based set below are not visible in this listing. */
5788 #define REGISTER register
5789 #define RMATCH(rx,ra,rb,rc,rd,re,rf,rg) rx = match(ra,rb,rc,rd,re,rf,rg)
5790 #define RRETURN(ra) return ra
5794 /* These versions of the macros manage a private stack on the heap. Note
5795 that the rd argument of RMATCH isn't actually used. It's the md argument of
5796 match(), which never changes.

RMATCH obtains a new heapframe from pcre_stack_malloc, marks the resumption
point in the current frame with setjmp(frame->Xwhere), and copies ra, rb, rc,
re, rf and rg into the new frame, chaining it to the current one through
Xprevframe. After the jump back, it reloads frame from md->thisframe and takes
rx from that frame's Xresult.

RRETURN releases the current frame with pcre_stack_free, steps back to its
Xprevframe, stores ra in that frame's Xresult, records the frame in
md->thisframe and longjmps to its Xwhere. */
5800 #define RMATCH(rx,ra,rb,rc,rd,re,rf,rg)\
5802 heapframe *newframe = (pcre_stack_malloc)(sizeof(heapframe));\
5803 if (setjmp(frame->Xwhere) == 0)\
5805 newframe->Xeptr = ra;\
5806 newframe->Xecode = rb;\
5807 newframe->Xoffset_top = rc;\
5808 newframe->Xims = re;\
5809 newframe->Xeptrb = rf;\
5810 newframe->Xflags = rg;\
5811 newframe->Xprevframe = frame;\
5813 DPRINTF(("restarting from line %d\n", __LINE__));\
5818 DPRINTF(("longjumped back to line %d\n", __LINE__));\
5819 frame = md->thisframe;\
5820 rx = frame->Xresult;\
5824 #define RRETURN(ra)\
5826 heapframe *newframe = frame;\
5827 frame = newframe->Xprevframe;\
5828 (pcre_stack_free)(newframe);\
5831 frame->Xresult = ra;\
5832 md->thisframe = frame;\
5833 longjmp(frame->Xwhere, 1);\
5839 /* Structure for remembering the local variables in a private frame. Each
X-prefixed member backs the variable of the same name (without the X) in
match(), which reaches it through "#define name frame->Xname" aliases.
NOTE(review): several members and the end of the declaration fall on lines that
are not visible in this listing. */
5841 typedef struct heapframe {
/* The frame that was current when this one was created; the top-level frame
has NULL here. */
5842 struct heapframe *Xprevframe;
5844 /* Function arguments that may change */
5846 const uschar *Xeptr;
5847 const uschar *Xecode;
5853 /* Function local variables */
5855 const uschar *Xcallpat;
5856 const uschar *Xcharptr;
5857 const uschar *Xdata;
5858 const uschar *Xnext;
5860 const uschar *Xprev;
5861 const uschar *Xsaved_eptr;
5863 recursion_info Xnew_recursive;
5870 unsigned long int Xoriginal_ims;
5874 int Xprop_fail_result;
5877 int Xprop_othercase;
5878 int Xprop_test_against;
5879 int *Xprop_test_variable;
5891 int Xsave_capture_last;
5892 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
5893 int Xstacksave[REC_STACK_SAVE_MAX];
5897 /* Place to pass back result, and where to jump back to (the heap-based
RMATCH and RRETURN macros use the Xresult and Xwhere members for this) */
5907 /***************************************************************************
5908 ***************************************************************************/
5912 /*************************************************
5913 * Match from current position *
5914 *************************************************/
5916 /* On entry ecode points to the first opcode, and eptr to the first character
5917 in the subject string, while eptrb holds the value of eptr at the start of the
5918 last bracketed group - used for breaking infinite loops matching zero-length
5919 strings. This function is called recursively in many circumstances. Whenever it
5920 returns a negative (error) response, the outer incarnation must also return the
5923 Performance note: It might be tempting to extract commonly used fields from the
5924 md structure (e.g. utf8, end_subject) into individual variables to improve
5925 performance. Tests using gcc on a SPARC disproved this; in the first case, it
5926 made performance worse.
5929 eptr pointer in subject
5930 ecode position in code
5931 offset_top current top pointer
5932 md pointer to "static" info for the match
5933 ims current /i, /m, and /s options
5934 eptrb pointer to chain of blocks containing eptr at start of
5935 brackets - for testing for empty matches
5937 match_condassert - this is an assertion condition
5938 match_isgroup - this is the start of a bracketed group
5940 Returns: MATCH_MATCH if matched ) these values are >= 0
5941 MATCH_NOMATCH if failed to match )
5942 a negative PCRE_ERROR_xxx value if aborted by an error condition
5943 (e.g. stopped by recursion limit)
5947 match(REGISTER const uschar *eptr, REGISTER const uschar *ecode,
5948 int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
5951 /* These variables do not need to be preserved over recursion in this function,
5952 so they can be ordinary variables in all cases. Mark them with "register"
5953 because they are used a lot in loops. */
5955 register int rrc; /* Returns from recursive calls */
5956 register int i; /* Used for loops not involving calls to RMATCH() */
5957 register int c; /* Character values not kept over RMATCH() calls */
5959 /* When recursion is not being used, all "local" variables that have to be
5960 preserved over calls to RMATCH() are part of a "frame" which is obtained from
5961 heap storage. Set up the top-level frame here; others are obtained from the
5962 heap whenever RMATCH() does a "recursion". See the macro definitions above. */
5965 heapframe *frame = (pcre_stack_malloc)(sizeof(heapframe));
5966 frame->Xprevframe = NULL; /* Marks the top level */
5968 /* Copy in the original argument variables */
5970 frame->Xeptr = eptr;
5971 frame->Xecode = ecode;
5972 frame->Xoffset_top = offset_top;
5974 frame->Xeptrb = eptrb;
5975 frame->Xflags = flags;
5977 /* This is where control jumps back to in order to effect "recursion" */
5981 /* Macros make the argument variables come from the current frame */
5983 #define eptr frame->Xeptr
5984 #define ecode frame->Xecode
5985 #define offset_top frame->Xoffset_top
5986 #define ims frame->Xims
5987 #define eptrb frame->Xeptrb
5988 #define flags frame->Xflags
5990 /* Ditto for the local variables */
5993 #define charptr frame->Xcharptr
5995 #define callpat frame->Xcallpat
5996 #define data frame->Xdata
5997 #define next frame->Xnext
5998 #define pp frame->Xpp
5999 #define prev frame->Xprev
6000 #define saved_eptr frame->Xsaved_eptr
6002 #define new_recursive frame->Xnew_recursive
6004 #define cur_is_word frame->Xcur_is_word
6005 #define condition frame->Xcondition
6006 #define minimize frame->Xminimize
6007 #define prev_is_word frame->Xprev_is_word
6009 #define original_ims frame->Xoriginal_ims
6012 #define prop_type frame->Xprop_type
6013 #define prop_fail_result frame->Xprop_fail_result
6014 #define prop_category frame->Xprop_category
6015 #define prop_chartype frame->Xprop_chartype
6016 #define prop_othercase frame->Xprop_othercase
6017 #define prop_test_against frame->Xprop_test_against
6018 #define prop_test_variable frame->Xprop_test_variable
6021 #define ctype frame->Xctype
6022 #define fc frame->Xfc
6023 #define fi frame->Xfi
6024 #define length frame->Xlength
6025 #define max frame->Xmax
6026 #define min frame->Xmin
6027 #define number frame->Xnumber
6028 #define offset frame->Xoffset
6029 #define op frame->Xop
6030 #define save_capture_last frame->Xsave_capture_last
6031 #define save_offset1 frame->Xsave_offset1
6032 #define save_offset2 frame->Xsave_offset2
6033 #define save_offset3 frame->Xsave_offset3
6034 #define stacksave frame->Xstacksave
6036 #define newptrb frame->Xnewptrb
6038 /* When recursion is being used, local variables are allocated on the stack and
6039 get preserved during recursion in the normal way. In this environment, fi and
6040 i, and fc and c, can be the same variables. */
6047 #ifdef SUPPORT_UTF8 /* Many of these variables are used ony */
6048 const uschar *charptr; /* small blocks of the code. My normal */
6049 #endif /* style of coding would have declared */
6050 const uschar *callpat; /* them within each of those blocks. */
6051 const uschar *data; /* However, in order to accommodate the */
6052 const uschar *next; /* version of this code that uses an */
6053 const uschar *pp; /* external "stack" implemented on the */
6054 const uschar *prev; /* heap, it is easier to declare them */
6055 const uschar *saved_eptr; /* all here, so the declarations can */
6056 /* be cut out in a block. The only */
6057 recursion_info new_recursive; /* declarations within blocks below are */
6058 /* for variables that do not have to */
6059 BOOL cur_is_word; /* be preserved over a recursive call */
6060 BOOL condition; /* to RMATCH(). */
6064 unsigned long int original_ims;
6068 int prop_fail_result;
6072 int prop_test_against;
6073 int *prop_test_variable;
6083 int save_capture_last;
6084 int save_offset1, save_offset2, save_offset3;
6085 int stacksave[REC_STACK_SAVE_MAX];
6090 /* These statements are here to stop the compiler complaining about uninitialized
6094 prop_fail_result = 0;
6095 prop_test_against = 0;
6096 prop_test_variable = NULL;
6099 /* OK, now we can get on with the real code of the function. Recursion is
6100 specified by the macros RMATCH and RRETURN. When NO_RECURSE is *not* defined,
6101 these just turn into a recursive call to match() and a "return", respectively.
6102 However, RMATCH isn't like a function call because it's quite a complicated
6103 macro. It has to be used in one particular way. This shouldn't, however, impact
6104 performance when true recursion is being used. */
6106 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
6108 original_ims = ims; /* Save for resetting on ')' */
6110 /* At the start of a bracketed group, add the current subject pointer to the
6111 stack of such pointers, to be re-instated at the end of the group when we hit
6112 the closing ket. When match() is called in other circumstances, we don't add to
6115 if ((flags & match_isgroup) != 0)
6117 newptrb.epb_prev = eptrb;
6118 newptrb.epb_saved_eptr = eptr;
6122 /* Now start processing the operations. */
6129 /* For partial matching, remember if we ever hit the end of the subject after
6130 matching at least one subject character. */
6133 eptr >= md->end_subject &&
6134 eptr > md->start_match)
6137 /* Opening capturing bracket. If there is space in the offset vector, save
6138 the current subject position in the working slot at the top of the vector. We
6139 mustn't change the current values of the data slot, because they may be set
6140 from a previous iteration of this group, and be referred to by a reference
6143 If the bracket fails to match, we need to restore this value and also the
6144 values of the final offsets, in case they were set by a previous iteration of
6147 If there isn't enough space in the offset vector, treat this as if it were a
6148 non-capturing bracket. Don't worry about setting the flag for the error case
6149 here; that is handled in the code for KET. */
6153 number = op - OP_BRA;
6155 /* For extended extraction brackets (large number), we have to fish out the
6156 number from a dummy opcode at the start. */
6158 if (number > EXTRACT_BASIC_MAX)
6159 number = GET2(ecode, 2+LINK_SIZE);
6160 offset = number << 1;
6163 printf("start bracket %d subject=", number);
6164 pchars(eptr, 16, TRUE, md);
6168 if (offset < md->offset_max)
6170 save_offset1 = md->offset_vector[offset];
6171 save_offset2 = md->offset_vector[offset+1];
6172 save_offset3 = md->offset_vector[md->offset_end - number];
6173 save_capture_last = md->capture_last;
6175 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
6176 md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
6180 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb,
6182 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6183 md->capture_last = save_capture_last;
6184 ecode += GET(ecode, 1);
6186 while (*ecode == OP_ALT);
6188 DPRINTF(("bracket %d failed\n", number));
6190 md->offset_vector[offset] = save_offset1;
6191 md->offset_vector[offset+1] = save_offset2;
6192 md->offset_vector[md->offset_end - number] = save_offset3;
6194 RRETURN(MATCH_NOMATCH);
6197 /* Insufficient room for saving captured contents */
6202 /* Other types of node can be handled by a switch */
6206 case OP_BRA: /* Non-capturing bracket: optimized */
6207 DPRINTF(("start bracket 0\n"));
6210 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb,
6212 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6213 ecode += GET(ecode, 1);
6215 while (*ecode == OP_ALT);
6216 DPRINTF(("bracket 0 failed\n"));
6217 RRETURN(MATCH_NOMATCH);
6219 /* Conditional group: compilation checked that there are no more than
6220 two branches. If the condition is false, skipping the first branch takes us
6221 past the end if there is only one branch, but that's OK because that is
6222 exactly what going to the ket would do. */
6225 if (ecode[LINK_SIZE+1] == OP_CREF) /* Condition extract or recurse test */
6227 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
6228 condition = (offset == CREF_RECURSE * 2)?
6229 (md->recursive != NULL) :
6230 (offset < offset_top && md->offset_vector[offset] >= 0);
6231 RMATCH(rrc, eptr, ecode + (condition?
6232 (LINK_SIZE + 4) : (LINK_SIZE + 1 + GET(ecode, 1))),
6233 offset_top, md, ims, eptrb, match_isgroup);
6237 /* The condition is an assertion. Call match() to evaluate it - setting
6238 the final argument TRUE causes it to stop at the end of an assertion. */
6242 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
6243 match_condassert | match_isgroup);
6244 if (rrc == MATCH_MATCH)
6246 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE+2);
6247 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
6249 else if (rrc != MATCH_NOMATCH)
6251 RRETURN(rrc); /* Need braces because of following else */
6253 else ecode += GET(ecode, 1);
6254 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb,
6258 /* Control never reaches here */
6260 /* Skip over conditional reference or large extraction number data if
6268 /* End of the pattern. If we are in a recursion, we should restore the
6269 offsets appropriately and continue from after the call. */
6272 if (md->recursive != NULL && md->recursive->group_num == 0)
6274 recursion_info *rec = md->recursive;
6275 DPRINTF(("Hit the end in a (?0) recursion\n"));
6276 md->recursive = rec->prevrec;
6277 memmove(md->offset_vector, rec->offset_save,
6278 rec->saved_max * sizeof(int));
6279 md->start_match = rec->save_start;
6281 ecode = rec->after_call;
6285 /* Otherwise, if PCRE_NOTEMPTY is set, fail if we have matched an empty
6286 string - backtracking will then try other alternatives, if any. */
6288 if (md->notempty && eptr == md->start_match) RRETURN(MATCH_NOMATCH);
6289 md->end_match_ptr = eptr; /* Record where we ended */
6290 md->end_offset_top = offset_top; /* and how many extracts were taken */
6291 RRETURN(MATCH_MATCH);
6293 /* Change option settings */
6298 DPRINTF(("ims set to %02lx\n", ims));
6301 /* Assertion brackets. Check the alternative branches in turn - the
6302 matching won't pass the KET for an assertion. If any one branch matches,
6303 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
6304 start of each branch to move the current point backwards, so the code at
6305 this level is identical to the lookahead case. */
6311 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
6313 if (rrc == MATCH_MATCH) break;
6314 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6315 ecode += GET(ecode, 1);
6317 while (*ecode == OP_ALT);
6318 if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
6320 /* If checking an assertion for a condition, return MATCH_MATCH. */
6322 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
6324 /* Continue from after the assertion, updating the offsets high water
6325 mark, since extracts may have been taken during the assertion. */
6327 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
6328 ecode += 1 + LINK_SIZE;
6329 offset_top = md->end_offset_top;
6332 /* Negative assertion: all branches must fail to match */
6335 case OP_ASSERTBACK_NOT:
6338 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
6340 if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH);
6341 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6342 ecode += GET(ecode,1);
6344 while (*ecode == OP_ALT);
6346 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
6348 ecode += 1 + LINK_SIZE;
6351 /* Move the subject pointer back. This occurs only at the start of
6352 each branch of a lookbehind assertion. If we are too close to the start to
6353 move back, this match function fails. When working with UTF-8 we move
6354 back a number of characters, not bytes. */
6361 for (i = 0; i < c; i++)
6364 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
6371 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
6374 eptr -= GET(ecode,1);
6375 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
6378 /* Skip to next op code */
6380 ecode += 1 + LINK_SIZE;
6383 /* The callout item calls an external function, if one is provided, passing
6384 details of the match so far. This is mainly for debugging, though the
6385 function is able to force a failure. */
6388 if (pcre_callout != NULL)
6390 pcre_callout_block cb;
6391 cb.version = 1; /* Version 1 of the callout block */
6392 cb.callout_number = ecode[1];
6393 cb.offset_vector = md->offset_vector;
6394 cb.subject = (const char *)md->start_subject;
6395 cb.subject_length = md->end_subject - md->start_subject;
6396 cb.start_match = md->start_match - md->start_subject;
6397 cb.current_position = eptr - md->start_subject;
6398 cb.pattern_position = GET(ecode, 2);
6399 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
6400 cb.capture_top = offset_top/2;
6401 cb.capture_last = md->capture_last;
6402 cb.callout_data = md->callout_data;
6403 if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
6404 if (rrc < 0) RRETURN(rrc);
6406 ecode += 2 + 2*LINK_SIZE;
6409 /* Recursion either matches the current regex, or some subexpression. The
6410 offset data is the offset to the starting bracket from the start of the
6411 whole pattern. (This is so that it works from duplicated subpatterns.)
6413 If there are any capturing brackets started but not finished, we have to
6414 save their starting points and reinstate them after the recursion. However,
6415 we don't know how many such there are (offset_top records the completed
6416 total) so we just have to save all the potential data. There may be up to
6417 65535 such values, which is too large to put on the stack, but using malloc
6418 for small numbers seems expensive. As a compromise, the stack is used when
6419 there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
6420 is used. If the malloc fails, the recursion is not attempted: the error
6421 code PCRE_ERROR_NOMEMORY is returned immediately via RRETURN, so no
6422 partially-saved offset data is ever used.
6424 There are also other values that have to be saved. We use a chained
6425 sequence of blocks that actually live on the stack. Thanks to Robin Houston
6426 for the original version of this logic. */
6430 callpat = md->start_code + GET(ecode, 1);
6431 new_recursive.group_num = *callpat - OP_BRA;
6433 /* For extended extraction brackets (large number), we have to fish out
6434 the number from a dummy opcode at the start. */
6436 if (new_recursive.group_num > EXTRACT_BASIC_MAX)
6437 new_recursive.group_num = GET2(callpat, 2+LINK_SIZE);
6439 /* Add to "recursing stack" */
6441 new_recursive.prevrec = md->recursive;
6442 md->recursive = &new_recursive;
6444 /* Find where to continue from afterwards */
6446 ecode += 1 + LINK_SIZE;
6447 new_recursive.after_call = ecode;
6449 /* Now save the offset data. */
6451 new_recursive.saved_max = md->offset_end;
6452 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
6453 new_recursive.offset_save = stacksave;
6456 new_recursive.offset_save =
6457 (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
6458 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
6461 memcpy(new_recursive.offset_save, md->offset_vector,
6462 new_recursive.saved_max * sizeof(int));
6463 new_recursive.save_start = md->start_match;
6464 md->start_match = eptr;
6466 /* OK, now we can do the recursion. For each top-level alternative we
6467 restore the offset and recursion data. */
6469 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
6472 RMATCH(rrc, eptr, callpat + 1 + LINK_SIZE, offset_top, md, ims,
6473 eptrb, match_isgroup);
6474 if (rrc == MATCH_MATCH)
6476 md->recursive = new_recursive.prevrec;
6477 if (new_recursive.offset_save != stacksave)
6478 (pcre_free)(new_recursive.offset_save);
6479 RRETURN(MATCH_MATCH);
6481 else if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6483 md->recursive = &new_recursive;
6484 memcpy(md->offset_vector, new_recursive.offset_save,
6485 new_recursive.saved_max * sizeof(int));
6486 callpat += GET(callpat, 1);
6488 while (*callpat == OP_ALT);
6490 DPRINTF(("Recursion didn't match\n"));
6491 md->recursive = new_recursive.prevrec;
6492 if (new_recursive.offset_save != stacksave)
6493 (pcre_free)(new_recursive.offset_save);
6494 RRETURN(MATCH_NOMATCH);
6496 /* Control never reaches here */
6498 /* "Once" brackets are like assertion brackets except that after a match,
6499 the point in the subject string is not moved back. Thus there can never be
6500 a move back into the brackets. Friedl calls these "atomic" subpatterns.
6501 Check the alternative branches in turn - the matching won't pass the KET
6502 for this kind of subpattern. If any one branch matches, we carry on as at
6503 the end of a normal bracket, leaving the subject pointer. */
6512 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims,
6513 eptrb, match_isgroup);
6514 if (rrc == MATCH_MATCH) break;
6515 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6516 ecode += GET(ecode,1);
6518 while (*ecode == OP_ALT);
6520 /* If we hit the end of the group (which could be repeated), fail */
6522 if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
6524 /* Continue as from after the assertion, updating the offsets high water
6525 mark, since extracts may have been taken. */
6527 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
6529 offset_top = md->end_offset_top;
6530 eptr = md->end_match_ptr;
6532 /* For a non-repeating ket, just continue at this level. This also
6533 happens for a repeating ket if no characters were matched in the group.
6534 This is the forcible breaking of infinite loops as implemented in Perl
6535 5.005. If there is an options reset, it will get obeyed in the normal
6536 course of events. */
6538 if (*ecode == OP_KET || eptr == saved_eptr)
6540 ecode += 1+LINK_SIZE;
6544 /* The repeating kets try the rest of the pattern or restart from the
6545 preceding bracket, in the appropriate order. We need to reset any options
6546 that changed within the bracket before re-running it, so check the next
6549 if (ecode[1+LINK_SIZE] == OP_OPT)
6551 ims = (ims & ~PCRE_IMS) | ecode[4];
6552 DPRINTF(("ims set to %02lx at group repeat\n", ims));
6555 if (*ecode == OP_KETRMIN)
6557 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0);
6558 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6559 RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);
6560 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6562 else /* OP_KETRMAX */
6564 RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);
6565 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6566 RMATCH(rrc, eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0);
6567 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6570 RRETURN(MATCH_NOMATCH);
6572 /* An alternation is the end of a branch; scan along to find the end of the
6573 bracketed group and go to there. */
6576 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
6579 /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
6580 that it may occur zero times. It may repeat infinitely, or not at all -
6581 i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
6582 repeat limits are compiled as a number of copies, with the optional ones
6583 preceded by BRAZERO or BRAMINZERO. */
6588 RMATCH(rrc, eptr, next, offset_top, md, ims, eptrb, match_isgroup);
6589 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6590 do next += GET(next,1); while (*next == OP_ALT);
6591 ecode = next + 1+LINK_SIZE;
6598 do next += GET(next,1); while (*next == OP_ALT);
6599 RMATCH(rrc, eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb,
6601 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6606 /* End of a group, repeated or non-repeating. If we are at the end of
6607 an assertion "group", stop matching and return MATCH_MATCH, but record the
6608 current high water mark for use by positive assertions. Do this also
6609 for the "once" (atomic, i.e. no backing up into the group) groups. */
6615 prev = ecode - GET(ecode, 1);
6616 saved_eptr = eptrb->epb_saved_eptr;
6618 /* Back up the stack of bracket start pointers. */
6620 eptrb = eptrb->epb_prev;
6622 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
6623 *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
6626 md->end_match_ptr = eptr; /* For ONCE */
6627 md->end_offset_top = offset_top;
6628 RRETURN(MATCH_MATCH);
6631 /* In all other cases except a conditional group we have to check the
6632 group number back at the start and if necessary complete handling an
6633 extraction by setting the offsets and bumping the high water mark. */
6635 if (*prev != OP_COND)
6637 number = *prev - OP_BRA;
6639 /* For extended extraction brackets (large number), we have to fish out
6640 the number from a dummy opcode at the start. */
6642 if (number > EXTRACT_BASIC_MAX) number = GET2(prev, 2+LINK_SIZE);
6643 offset = number << 1;
6646 printf("end bracket %d", number);
6650 /* Test for a numbered group. This includes groups called as a result
6651 of recursion. Note that whole-pattern recursion is coded as a recurse
6652 into group 0, so it won't be picked up here. Instead, we catch it when
6653 the OP_END is reached. */
6657 md->capture_last = number;
6658 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
6660 md->offset_vector[offset] =
6661 md->offset_vector[md->offset_end - number];
6662 md->offset_vector[offset+1] = eptr - md->start_subject;
6663 if (offset_top <= offset) offset_top = offset + 2;
6666 /* Handle a recursively called group. Restore the offsets
6667 appropriately and continue from after the call. */
6669 if (md->recursive != NULL && md->recursive->group_num == number)
6671 recursion_info *rec = md->recursive;
6672 DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
6673 md->recursive = rec->prevrec;
6674 md->start_match = rec->save_start;
6675 memcpy(md->offset_vector, rec->offset_save,
6676 rec->saved_max * sizeof(int));
6677 ecode = rec->after_call;
6684 /* Reset the value of the ims flags, in case they got changed during
6688 DPRINTF(("ims reset to %02lx\n", ims));
6690 /* For a non-repeating ket, just continue at this level. This also
6691 happens for a repeating ket if no characters were matched in the group.
6692 This is the forcible breaking of infinite loops as implemented in Perl
6693 5.005. If there is an options reset, it will get obeyed in the normal
6694 course of events. */
6696 if (*ecode == OP_KET || eptr == saved_eptr)
6698 ecode += 1 + LINK_SIZE;
6702 /* The repeating kets try the rest of the pattern or restart from the
6703 preceding bracket, in the appropriate order. */
6705 if (*ecode == OP_KETRMIN)
6707 RMATCH(rrc, eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0);
6708 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6709 RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);
6710 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6712 else /* OP_KETRMAX */
6714 RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);
6715 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6716 RMATCH(rrc, eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0);
6717 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6721 RRETURN(MATCH_NOMATCH);
6723 /* Start of subject unless notbol, or after internal newline if multiline */
6726 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
6727 if ((ims & PCRE_MULTILINE) != 0)
6729 if (eptr != md->start_subject && eptr[-1] != NEWLINE)
6730 RRETURN(MATCH_NOMATCH);
6734 /* ... else fall through */
6736 /* Start of subject assertion */
6739 if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
6743 /* Start of match assertion */
6746 if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
6750 /* Assert before internal newline if multiline, or before a terminating
6751 newline unless endonly is set, else end of subject unless noteol is set. */
6754 if ((ims & PCRE_MULTILINE) != 0)
6756 if (eptr < md->end_subject)
6757 { if (*eptr != NEWLINE) RRETURN(MATCH_NOMATCH); }
6759 { if (md->noteol) RRETURN(MATCH_NOMATCH); }
6765 if (md->noteol) RRETURN(MATCH_NOMATCH);
6768 if (eptr < md->end_subject - 1 ||
6769 (eptr == md->end_subject - 1 && *eptr != NEWLINE))
6770 RRETURN(MATCH_NOMATCH);
6775 /* ... else fall through */
6777 /* End of subject assertion (\z) */
6780 if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
6784 /* End of subject or ending \n assertion (\Z) */
6787 if (eptr < md->end_subject - 1 ||
6788 (eptr == md->end_subject - 1 && *eptr != NEWLINE)) RRETURN(MATCH_NOMATCH);
6792 /* Word boundary assertions */
6794 case OP_NOT_WORD_BOUNDARY:
6795 case OP_WORD_BOUNDARY:
6798 /* Find out if the previous and current characters are "word" characters.
6799 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
6800 be "non-word" characters. */
6805 if (eptr == md->start_subject) prev_is_word = FALSE; else
6807 const uschar *lastptr = eptr - 1;
6808 while((*lastptr & 0xc0) == 0x80) lastptr--;
6809 GETCHAR(c, lastptr);
6810 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
6812 if (eptr >= md->end_subject) cur_is_word = FALSE; else
6815 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
6821 /* More streamlined when not in UTF-8 mode */
6824 prev_is_word = (eptr != md->start_subject) &&
6825 ((md->ctypes[eptr[-1]] & ctype_word) != 0);
6826 cur_is_word = (eptr < md->end_subject) &&
6827 ((md->ctypes[*eptr] & ctype_word) != 0);
6830 /* Now see if the situation is what we want */
6832 if ((*ecode++ == OP_WORD_BOUNDARY)?
6833 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
6834 RRETURN(MATCH_NOMATCH);
6838 /* Match a single character type; inline for speed */
6841 if ((ims & PCRE_DOTALL) == 0 && eptr < md->end_subject && *eptr == NEWLINE)
6842 RRETURN(MATCH_NOMATCH);
6843 if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
6846 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
6851 /* Match a single byte, even in UTF-8 mode. This opcode really does match
6852 any byte, even newline, independent of the setting of PCRE_DOTALL. */
6855 if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
6860 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
6861 GETCHARINCTEST(c, eptr);
6866 (md->ctypes[c] & ctype_digit) != 0
6868 RRETURN(MATCH_NOMATCH);
6873 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
6874 GETCHARINCTEST(c, eptr);
6879 (md->ctypes[c] & ctype_digit) == 0
6881 RRETURN(MATCH_NOMATCH);
6885 case OP_NOT_WHITESPACE:
6886 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
6887 GETCHARINCTEST(c, eptr);
6892 (md->ctypes[c] & ctype_space) != 0
6894 RRETURN(MATCH_NOMATCH);
6899 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
6900 GETCHARINCTEST(c, eptr);
6905 (md->ctypes[c] & ctype_space) == 0
6907 RRETURN(MATCH_NOMATCH);
6911 case OP_NOT_WORDCHAR:
6912 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
6913 GETCHARINCTEST(c, eptr);
6918 (md->ctypes[c] & ctype_word) != 0
6920 RRETURN(MATCH_NOMATCH);
6925 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
6926 GETCHARINCTEST(c, eptr);
6931 (md->ctypes[c] & ctype_word) == 0
6933 RRETURN(MATCH_NOMATCH);
6938 /* Check the next character by Unicode property. We will get here only
6939 if the support is in the binary; otherwise a compile-time error occurs. */
6943 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
6944 GETCHARINCTEST(c, eptr);
6946 int chartype, rqdtype;
6948 int category = ucp_findchar(c, &chartype, &othercase);
6950 rqdtype = *(++ecode);
6955 if ((rqdtype - 128 != category) == (op == OP_PROP))
6956 RRETURN(MATCH_NOMATCH);
6960 if ((rqdtype != chartype) == (op == OP_PROP))
6961 RRETURN(MATCH_NOMATCH);
6966 /* Match an extended Unicode sequence. We will get here only if the support
6967 is in the binary; otherwise a compile-time error occurs. */
6970 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
6971 GETCHARINCTEST(c, eptr);
6975 int category = ucp_findchar(c, &chartype, &othercase);
6976 if (category == ucp_M) RRETURN(MATCH_NOMATCH);
6977 while (eptr < md->end_subject)
6980 if (!md->utf8) c = *eptr; else
6982 GETCHARLEN(c, eptr, len);
6984 category = ucp_findchar(c, &chartype, &othercase);
6985 if (category != ucp_M) break;
6994 /* Match a back reference, possibly repeatedly. Look past the end of the
6995 item to see if there is repeat information following. The code is similar
6996 to that for character classes, but repeated for efficiency. Then obey
6997 similar code to character type repeats - written out again for speed.
6998 However, if the referenced string is the empty string, always treat
6999 it as matched, any number of times (otherwise there could be infinite
7004 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
7005 ecode += 3; /* Advance past item */
7007 /* If the reference is unset, set the length to be longer than the amount
7008 of subject left; this ensures that every attempt at a match fails. We
7009 can't just fail here, because of the possibility of quantifiers with zero
7012 length = (offset >= offset_top || md->offset_vector[offset] < 0)?
7013 md->end_subject - eptr + 1 :
7014 md->offset_vector[offset+1] - md->offset_vector[offset];
7016 /* Set up for repetition, or handle the non-repeated case */
7026 c = *ecode++ - OP_CRSTAR;
7027 minimize = (c & 1) != 0;
7028 min = rep_min[c]; /* Pick up values from tables; */
7029 max = rep_max[c]; /* zero for max => infinity */
7030 if (max == 0) max = INT_MAX;
7035 minimize = (*ecode == OP_CRMINRANGE);
7036 min = GET2(ecode, 1);
7037 max = GET2(ecode, 3);
7038 if (max == 0) max = INT_MAX;
7042 default: /* No repeat follows */
7043 if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
7045 continue; /* With the main loop */
7048 /* If the length of the reference is zero, just continue with the
7051 if (length == 0) continue;
7053 /* First, ensure the minimum number of matches are present. We get back
7054 the length of the reference string explicitly rather than passing the
7055 address of eptr, so that eptr can be a register variable. */
7057 for (i = 1; i <= min; i++)
7059 if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
7063 /* If min = max, continue at the same level without recursion.
7064 They are not both allowed to be zero. */
7066 if (min == max) continue;
7068 /* If minimizing, keep trying and advancing the pointer */
7072 for (fi = min;; fi++)
7074 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7075 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7076 if (fi >= max || !match_ref(offset, eptr, length, md, ims))
7077 RRETURN(MATCH_NOMATCH);
7080 /* Control never gets here */
7083 /* If maximizing, find the longest string and work backwards */
7088 for (i = min; i < max; i++)
7090 if (!match_ref(offset, eptr, length, md, ims)) break;
7095 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7096 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7099 RRETURN(MATCH_NOMATCH);
7102 /* Control never gets here */
7106 /* Match a bit-mapped character class, possibly repeatedly. This op code is
7107 used when all the characters in the class have values in the range 0-255,
7108 and either the matching is caseful, or the characters are in the range
7109 0-127 when UTF-8 processing is enabled. The only difference between
7110 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
7113 First, look past the end of the item to see if there is repeat information
7114 following. Then obey similar code to character type repeats - written out
7120 data = ecode + 1; /* Save for matching */
7121 ecode += 33; /* Advance past the item */
7131 c = *ecode++ - OP_CRSTAR;
7132 minimize = (c & 1) != 0;
7133 min = rep_min[c]; /* Pick up values from tables; */
7134 max = rep_max[c]; /* zero for max => infinity */
7135 if (max == 0) max = INT_MAX;
7140 minimize = (*ecode == OP_CRMINRANGE);
7141 min = GET2(ecode, 1);
7142 max = GET2(ecode, 3);
7143 if (max == 0) max = INT_MAX;
7147 default: /* No repeat follows */
7152 /* First, ensure the minimum number of matches are present. */
7158 for (i = 1; i <= min; i++)
7160 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
7161 GETCHARINC(c, eptr);
7164 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
7168 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
7174 /* Not UTF-8 mode */
7176 for (i = 1; i <= min; i++)
7178 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
7180 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
7184 /* If max == min we can continue with the main loop without the
7187 if (min == max) continue;
7189 /* If minimizing, keep testing the rest of the expression and advancing
7190 the pointer while it matches the class. */
7198 for (fi = min;; fi++)
7200 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7201 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7202 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
7203 GETCHARINC(c, eptr);
7206 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
7210 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
7216 /* Not UTF-8 mode */
7218 for (fi = min;; fi++)
7220 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7221 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7222 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
7224 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
7227 /* Control never gets here */
7230 /* If maximizing, find the longest possible run, then work backwards. */
7240 for (i = min; i < max; i++)
7243 if (eptr >= md->end_subject) break;
7244 GETCHARLEN(c, eptr, len);
7247 if (op == OP_CLASS) break;
7251 if ((data[c/8] & (1 << (c&7))) == 0) break;
7257 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7258 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7259 if (eptr-- == pp) break; /* Stop if tried at original pos */
7265 /* Not UTF-8 mode */
7267 for (i = min; i < max; i++)
7269 if (eptr >= md->end_subject) break;
7271 if ((data[c/8] & (1 << (c&7))) == 0) break;
7276 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7278 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7282 RRETURN(MATCH_NOMATCH);
7285 /* Control never gets here */
7288 /* Match an extended character class. This opcode is encountered only
7289 in UTF-8 mode, because that's the only time it is compiled. */
7294 data = ecode + 1 + LINK_SIZE; /* Save for matching */
7295 ecode += GET(ecode, 1); /* Advance past the item */
7305 c = *ecode++ - OP_CRSTAR;
7306 minimize = (c & 1) != 0;
7307 min = rep_min[c]; /* Pick up values from tables; */
7308 max = rep_max[c]; /* zero for max => infinity */
7309 if (max == 0) max = INT_MAX;
7314 minimize = (*ecode == OP_CRMINRANGE);
7315 min = GET2(ecode, 1);
7316 max = GET2(ecode, 3);
7317 if (max == 0) max = INT_MAX;
7321 default: /* No repeat follows */
7326 /* First, ensure the minimum number of matches are present. */
7328 for (i = 1; i <= min; i++)
7330 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
7331 GETCHARINC(c, eptr);
7332 if (!match_xclass(c, data)) RRETURN(MATCH_NOMATCH);
7335 /* If max == min we can continue with the main loop without the
7338 if (min == max) continue;
7340 /* If minimizing, keep testing the rest of the expression and advancing
7341 the pointer while it matches the class. */
7345 for (fi = min;; fi++)
7347 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7348 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7349 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
7350 GETCHARINC(c, eptr);
7351 if (!match_xclass(c, data)) RRETURN(MATCH_NOMATCH);
7353 /* Control never gets here */
7356 /* If maximizing, find the longest possible run, then work backwards. */
7361 for (i = min; i < max; i++)
7364 if (eptr >= md->end_subject) break;
7365 GETCHARLEN(c, eptr, len);
7366 if (!match_xclass(c, data)) break;
7371 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7372 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7373 if (eptr-- == pp) break; /* Stop if tried at original pos */
7376 RRETURN(MATCH_NOMATCH);
7379 /* Control never gets here */
7381 #endif /* End of XCLASS */
7383 /* Match a single character, casefully */
7391 GETCHARLEN(fc, ecode, length);
7392 if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
7393 while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
7398 /* Non-UTF-8 mode */
7400 if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
7401 if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
7406 /* Match a single character, caselessly */
7414 GETCHARLEN(fc, ecode, length);
7416 if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
7418 /* If the pattern character's value is < 128, we have only one byte, and
7419 can use the fast lookup table. */
7423 if (md->lcc[*ecode++] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
7426 /* Otherwise we must pick up the subject character */
7431 GETCHARINC(dc, eptr);
7434 /* If we have Unicode property support, we can use it to test the other
7435 case of the character, if there is one. The result of ucp_findchar() is
7436 < 0 if the char isn't found, and othercase is returned as zero if there
7444 if (ucp_findchar(fc, &chartype, &othercase) < 0 || dc != othercase)
7446 RRETURN(MATCH_NOMATCH);
7451 #endif /* SUPPORT_UTF8 */
7453 /* Non-UTF-8 mode */
7455 if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
7456 if (md->lcc[ecode[1]] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
7461 /* Match a single character repeatedly; different opcodes share code. */
7464 min = max = GET2(ecode, 1);
7471 max = GET2(ecode, 1);
7472 minimize = *ecode == OP_MINUPTO;
7482 c = *ecode++ - OP_STAR;
7483 minimize = (c & 1) != 0;
7484 min = rep_min[c]; /* Pick up values from tables; */
7485 max = rep_max[c]; /* zero for max => infinity */
7486 if (max == 0) max = INT_MAX;
7488 /* Common code for all repeated single-character matches. We can give
7489 up quickly if there are fewer than the minimum number of characters left in
7498 GETCHARLEN(fc, ecode, length);
7499 if (min * length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
7502 /* Handle multibyte character matching specially here. There is
7503 support for caseless matching if UCP support is present. */
7513 if ((ims & PCRE_CASELESS) != 0 &&
7514 ucp_findchar(fc, &chartype, &othercase) >= 0 &&
7516 oclength = ord2utf8(othercase, occhars);
7517 #endif /* SUPPORT_UCP */
7519 for (i = 1; i <= min; i++)
7521 if (memcmp(eptr, charptr, length) == 0) eptr += length;
7522 /* Need braces because of following else */
7523 else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
7526 if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
7531 if (min == max) continue;
7535 for (fi = min;; fi++)
7537 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7538 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7539 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
7540 if (memcmp(eptr, charptr, length) == 0) eptr += length;
7541 /* Need braces because of following else */
7542 else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
7545 if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
7549 /* Control never gets here */
7554 for (i = min; i < max; i++)
7556 if (eptr > md->end_subject - length) break;
7557 if (memcmp(eptr, charptr, length) == 0) eptr += length;
7558 else if (oclength == 0) break;
7561 if (memcmp(eptr, occhars, oclength) != 0) break;
7567 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7568 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7571 RRETURN(MATCH_NOMATCH);
7573 /* Control never gets here */
7576 /* If the length of a UTF-8 character is 1, we fall through here, and
7577 obey the code as for non-UTF-8 characters below, though in this case the
7578 value of fc will always be < 128. */
7581 #endif /* SUPPORT_UTF8 */
7583 /* When not in UTF-8 mode, load a single-byte character. */
7585 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
7589 /* The value of fc at this point is always less than 256, though we may or
7590 may not be in UTF-8 mode. The code is duplicated for the caseless and
7591 caseful cases, for speed, since matching characters is likely to be quite
7592 common. First, ensure the minimum number of matches are present. If min =
7593 max, continue at the same level without recursing. Otherwise, if
7594 minimizing, keep trying the rest of the expression and advancing one
7595 matching character if failing, up to the maximum. Alternatively, if
7596 maximizing, find the maximum number of characters and work backwards. */
7598 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
7601 if ((ims & PCRE_CASELESS) != 0)
7604 for (i = 1; i <= min; i++)
7605 if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
7606 if (min == max) continue;
7609 for (fi = min;; fi++)
7611 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7612 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7613 if (fi >= max || eptr >= md->end_subject ||
7614 fc != md->lcc[*eptr++])
7615 RRETURN(MATCH_NOMATCH);
7617 /* Control never gets here */
7622 for (i = min; i < max; i++)
7624 if (eptr >= md->end_subject || fc != md->lcc[*eptr]) break;
7629 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7631 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7633 RRETURN(MATCH_NOMATCH);
7635 /* Control never gets here */
7638 /* Caseful comparisons (includes all multi-byte characters) */
7642 for (i = 1; i <= min; i++) if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
7643 if (min == max) continue;
7646 for (fi = min;; fi++)
7648 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7649 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7650 if (fi >= max || eptr >= md->end_subject || fc != *eptr++)
7651 RRETURN(MATCH_NOMATCH);
7653 /* Control never gets here */
7658 for (i = min; i < max; i++)
7660 if (eptr >= md->end_subject || fc != *eptr) break;
7665 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7667 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7669 RRETURN(MATCH_NOMATCH);
7672 /* Control never gets here */
7674 /* Match a negated single one-byte character. The character we are
7675 checking can be multibyte. */
7678 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
7680 GETCHARINCTEST(c, eptr);
7681 if ((ims & PCRE_CASELESS) != 0)
7687 if (md->lcc[*ecode++] == c) RRETURN(MATCH_NOMATCH);
7691 if (*ecode++ == c) RRETURN(MATCH_NOMATCH);
7695 /* Match a negated single one-byte character repeatedly. This is almost a
7696 repeat of the code for a repeated single character, but I haven't found a
7697 nice way of commoning these up that doesn't require a test of the
7698 positive/negative option for each character match. Maybe that wouldn't add
7699 very much to the time taken, but character matching *is* what this is all about. */
7703 min = max = GET2(ecode, 1);
7710 max = GET2(ecode, 1);
7711 minimize = *ecode == OP_NOTMINUPTO;
7720 case OP_NOTMINQUERY:
7721 c = *ecode++ - OP_NOTSTAR;
7722 minimize = (c & 1) != 0;
7723 min = rep_min[c]; /* Pick up values from tables; */
7724 max = rep_max[c]; /* zero for max => infinity */
7725 if (max == 0) max = INT_MAX;
7727 /* Common code for all repeated single-byte matches. We can give up quickly
7728 if there are fewer than the minimum number of bytes left in the subject. */
7732 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
7735 /* The code is duplicated for the caseless and caseful cases, for speed,
7736 since matching characters is likely to be quite common. First, ensure the
7737 minimum number of matches are present. If min = max, continue at the same
7738 level without recursing. Otherwise, if minimizing, keep trying the rest of
7739 the expression and advancing one matching character if failing, up to the
7740 maximum. Alternatively, if maximizing, find the maximum number of
7741 characters and work backwards. */
7743 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
7746 if ((ims & PCRE_CASELESS) != 0)
7755 for (i = 1; i <= min; i++)
7757 GETCHARINC(d, eptr);
7758 if (d < 256) d = md->lcc[d];
7759 if (fc == d) RRETURN(MATCH_NOMATCH);
7765 /* Not UTF-8 mode */
7767 for (i = 1; i <= min; i++)
7768 if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
7771 if (min == max) continue;
7780 for (fi = min;; fi++)
7782 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7783 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7784 GETCHARINC(d, eptr);
7785 if (d < 256) d = md->lcc[d];
7786 if (fi >= max || eptr >= md->end_subject || fc == d)
7787 RRETURN(MATCH_NOMATCH);
7792 /* Not UTF-8 mode */
7794 for (fi = min;; fi++)
7796 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7797 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7798 if (fi >= max || eptr >= md->end_subject || fc == md->lcc[*eptr++])
7799 RRETURN(MATCH_NOMATCH);
7802 /* Control never gets here */
7816 for (i = min; i < max; i++)
7819 if (eptr >= md->end_subject) break;
7820 GETCHARLEN(d, eptr, len);
7821 if (d < 256) d = md->lcc[d];
7827 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7828 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7829 if (eptr-- == pp) break; /* Stop if tried at original pos */
7835 /* Not UTF-8 mode */
7837 for (i = min; i < max; i++)
7839 if (eptr >= md->end_subject || fc == md->lcc[*eptr]) break;
7844 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7845 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7850 RRETURN(MATCH_NOMATCH);
7852 /* Control never gets here */
7855 /* Caseful comparisons */
7864 for (i = 1; i <= min; i++)
7866 GETCHARINC(d, eptr);
7867 if (fc == d) RRETURN(MATCH_NOMATCH);
7872 /* Not UTF-8 mode */
7874 for (i = 1; i <= min; i++)
7875 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
7878 if (min == max) continue;
7887 for (fi = min;; fi++)
7889 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7890 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7891 GETCHARINC(d, eptr);
7892 if (fi >= max || eptr >= md->end_subject || fc == d)
7893 RRETURN(MATCH_NOMATCH);
7898 /* Not UTF-8 mode */
7900 for (fi = min;; fi++)
7902 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7903 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7904 if (fi >= max || eptr >= md->end_subject || fc == *eptr++)
7905 RRETURN(MATCH_NOMATCH);
7908 /* Control never gets here */
7922 for (i = min; i < max; i++)
7925 if (eptr >= md->end_subject) break;
7926 GETCHARLEN(d, eptr, len);
7932 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7933 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7934 if (eptr-- == pp) break; /* Stop if tried at original pos */
7940 /* Not UTF-8 mode */
7942 for (i = min; i < max; i++)
7944 if (eptr >= md->end_subject || fc == *eptr) break;
7949 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7950 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7955 RRETURN(MATCH_NOMATCH);
7958 /* Control never gets here */
7960 /* Match a single character type repeatedly; several different opcodes
7961 share code. This is very similar to the code for single characters, but we
7962 repeat it in the interests of efficiency. */
7965 min = max = GET2(ecode, 1);
7971 case OP_TYPEMINUPTO:
7973 max = GET2(ecode, 1);
7974 minimize = *ecode == OP_TYPEMINUPTO;
7979 case OP_TYPEMINSTAR:
7981 case OP_TYPEMINPLUS:
7983 case OP_TYPEMINQUERY:
7984 c = *ecode++ - OP_TYPESTAR;
7985 minimize = (c & 1) != 0;
7986 min = rep_min[c]; /* Pick up values from tables; */
7987 max = rep_max[c]; /* zero for max => infinity */
7988 if (max == 0) max = INT_MAX;
7990 /* Common code for all repeated single character type matches. Note that
7991 in UTF-8 mode, '.' matches a character of any length, but for the other
7992 character types, the valid characters are all one-byte long. */
7995 ctype = *ecode++; /* Code for the character type */
7998 if (ctype == OP_PROP || ctype == OP_NOTPROP)
8000 prop_fail_result = ctype == OP_NOTPROP;
8001 prop_type = *ecode++;
8002 if (prop_type >= 128)
8004 prop_test_against = prop_type - 128;
8005 prop_test_variable = &prop_category;
8009 prop_test_against = prop_type;
8010 prop_test_variable = &prop_chartype;
8013 else prop_type = -1;
8016 /* First, ensure the minimum number of matches are present. Use inline
8017 code for maximizing the speed, and do the type test once at the start
8018 (i.e. keep it out of the loop). Also we can test that there are at least
8019 the minimum number of bytes before we start. This isn't as effective in
8020 UTF-8 mode, but it does no harm. Separate the UTF-8 code completely as that
8021 is tidier. Also separate the UCP code, which can be the same for both UTF-8
8022 and single-bytes. */
8024 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
8030 for (i = 1; i <= min; i++)
8032 GETCHARINC(c, eptr);
8033 prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
8034 if ((*prop_test_variable == prop_test_against) == prop_fail_result)
8035 RRETURN(MATCH_NOMATCH);
8039 /* Match extended Unicode sequences. We will get here only if the
8040 support is in the binary; otherwise a compile-time error occurs. */
8042 else if (ctype == OP_EXTUNI)
8044 for (i = 1; i <= min; i++)
8046 GETCHARINCTEST(c, eptr);
8047 prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
8048 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
8049 while (eptr < md->end_subject)
8052 if (!md->utf8) c = *eptr; else
8054 GETCHARLEN(c, eptr, len);
8056 prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
8057 if (prop_category != ucp_M) break;
8064 #endif /* SUPPORT_UCP */
8066 /* Handle all other cases when the coding is UTF-8 */
8069 if (md->utf8) switch(ctype)
8072 for (i = 1; i <= min; i++)
8074 if (eptr >= md->end_subject ||
8075 (*eptr++ == NEWLINE && (ims & PCRE_DOTALL) == 0))
8076 RRETURN(MATCH_NOMATCH);
8077 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
8086 for (i = 1; i <= min; i++)
8088 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
8089 GETCHARINC(c, eptr);
8090 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
8091 RRETURN(MATCH_NOMATCH);
8096 for (i = 1; i <= min; i++)
8098 if (eptr >= md->end_subject ||
8099 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
8100 RRETURN(MATCH_NOMATCH);
8101 /* No need to skip more bytes - we know it's a 1-byte character */
8105 case OP_NOT_WHITESPACE:
8106 for (i = 1; i <= min; i++)
8108 if (eptr >= md->end_subject ||
8109 (*eptr < 128 && (md->ctypes[*eptr++] & ctype_space) != 0))
8110 RRETURN(MATCH_NOMATCH);
8111 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
8116 for (i = 1; i <= min; i++)
8118 if (eptr >= md->end_subject ||
8119 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
8120 RRETURN(MATCH_NOMATCH);
8121 /* No need to skip more bytes - we know it's a 1-byte character */
8125 case OP_NOT_WORDCHAR:
8126 for (i = 1; i <= min; i++)
8128 if (eptr >= md->end_subject ||
8129 (*eptr < 128 && (md->ctypes[*eptr++] & ctype_word) != 0))
8130 RRETURN(MATCH_NOMATCH);
8131 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
8136 for (i = 1; i <= min; i++)
8138 if (eptr >= md->end_subject ||
8139 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
8140 RRETURN(MATCH_NOMATCH);
8141 /* No need to skip more bytes - we know it's a 1-byte character */
8146 RRETURN(PCRE_ERROR_INTERNAL);
8147 } /* End switch(ctype) */
8150 #endif /* SUPPORT_UTF8 */
8152 /* Code for the non-UTF-8 case for minimum matching of operators other
8153 than OP_PROP and OP_NOTPROP. */
8158 if ((ims & PCRE_DOTALL) == 0)
8160 for (i = 1; i <= min; i++)
8161 if (*eptr++ == NEWLINE) RRETURN(MATCH_NOMATCH);
8171 for (i = 1; i <= min; i++)
8172 if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
8176 for (i = 1; i <= min; i++)
8177 if ((md->ctypes[*eptr++] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
8180 case OP_NOT_WHITESPACE:
8181 for (i = 1; i <= min; i++)
8182 if ((md->ctypes[*eptr++] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
8186 for (i = 1; i <= min; i++)
8187 if ((md->ctypes[*eptr++] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
8190 case OP_NOT_WORDCHAR:
8191 for (i = 1; i <= min; i++)
8192 if ((md->ctypes[*eptr++] & ctype_word) != 0)
8193 RRETURN(MATCH_NOMATCH);
8197 for (i = 1; i <= min; i++)
8198 if ((md->ctypes[*eptr++] & ctype_word) == 0)
8199 RRETURN(MATCH_NOMATCH);
8203 RRETURN(PCRE_ERROR_INTERNAL);
8207 /* If min = max, continue at the same level without recursing */
8209 if (min == max) continue;
8211 /* If minimizing, we have to test the rest of the pattern before each
8212 subsequent match. Again, separate the UTF-8 case for speed, and also
8213 separate the UCP cases. */
8220 for (fi = min;; fi++)
8222 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
8223 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
8224 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
8225 GETCHARINC(c, eptr);
8226 prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
8227 if ((*prop_test_variable == prop_test_against) == prop_fail_result)
8228 RRETURN(MATCH_NOMATCH);
8232 /* Match extended Unicode sequences. We will get here only if the
8233 support is in the binary; otherwise a compile-time error occurs. */
8235 else if (ctype == OP_EXTUNI)
8237 for (fi = min;; fi++)
8239 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
8240 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
8241 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
8242 GETCHARINCTEST(c, eptr);
8243 prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
8244 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
8245 while (eptr < md->end_subject)
8248 if (!md->utf8) c = *eptr; else
8250 GETCHARLEN(c, eptr, len);
8252 prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
8253 if (prop_category != ucp_M) break;
8260 #endif /* SUPPORT_UCP */
8266 for (fi = min;; fi++)
8268 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
8269 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
8270 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
8272 GETCHARINC(c, eptr);
8276 if ((ims & PCRE_DOTALL) == 0 && c == NEWLINE) RRETURN(MATCH_NOMATCH);
8283 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
8284 RRETURN(MATCH_NOMATCH);
8288 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
8289 RRETURN(MATCH_NOMATCH);
8292 case OP_NOT_WHITESPACE:
8293 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
8294 RRETURN(MATCH_NOMATCH);
8298 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
8299 RRETURN(MATCH_NOMATCH);
8302 case OP_NOT_WORDCHAR:
8303 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
8304 RRETURN(MATCH_NOMATCH);
8308 if (c >= 256 && (md->ctypes[c] & ctype_word) == 0)
8309 RRETURN(MATCH_NOMATCH);
8313 RRETURN(PCRE_ERROR_INTERNAL);
8319 /* Not UTF-8 mode */
8321 for (fi = min;; fi++)
8323 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
8324 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
8325 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
8330 if ((ims & PCRE_DOTALL) == 0 && c == NEWLINE) RRETURN(MATCH_NOMATCH);
8337 if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
8341 if ((md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
8344 case OP_NOT_WHITESPACE:
8345 if ((md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
8349 if ((md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
8352 case OP_NOT_WORDCHAR:
8353 if ((md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
8357 if ((md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
8361 RRETURN(PCRE_ERROR_INTERNAL);
8365 /* Control never gets here */
8368 /* If maximizing it is worth using inline code for speed, doing the type
8369 test once at the start (i.e. keep it out of the loop). Again, keep the
8370 UTF-8 and UCP stuff separate. */
8374 pp = eptr; /* Remember where we started */
8379 for (i = min; i < max; i++)
8382 if (eptr >= md->end_subject) break;
8383 GETCHARLEN(c, eptr, len);
8384 prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
8385 if ((*prop_test_variable == prop_test_against) == prop_fail_result)
8390 /* eptr is now past the end of the maximum run */
8394 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
8395 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
8396 if (eptr-- == pp) break; /* Stop if tried at original pos */
8401 /* Match extended Unicode sequences. We will get here only if the
8402 support is in the binary; otherwise a compile-time error occurs. */
8404 else if (ctype == OP_EXTUNI)
8406 for (i = min; i < max; i++)
8408 if (eptr >= md->end_subject) break;
8409 GETCHARINCTEST(c, eptr);
8410 prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
8411 if (prop_category == ucp_M) break;
8412 while (eptr < md->end_subject)
8415 if (!md->utf8) c = *eptr; else
8417 GETCHARLEN(c, eptr, len);
8419 prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
8420 if (prop_category != ucp_M) break;
8425 /* eptr is now past the end of the maximum run */
8429 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
8430 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
8431 if (eptr-- == pp) break; /* Stop if tried at original pos */
8432 for (;;) /* Move back over one extended */
8436 if (!md->utf8) c = *eptr; else
8438 GETCHARLEN(c, eptr, len);
8440 prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
8441 if (prop_category != ucp_M) break;
8448 #endif /* SUPPORT_UCP */
8459 /* Special code is required for UTF8, but when the maximum is unlimited
8460 we don't need it, so we repeat the non-UTF8 code. This is probably
8461 worth it, because .* is quite a common idiom. */
8465 if ((ims & PCRE_DOTALL) == 0)
8467 for (i = min; i < max; i++)
8469 if (eptr >= md->end_subject || *eptr == NEWLINE) break;
8471 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
8476 for (i = min; i < max; i++)
8479 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
8484 /* Handle unlimited UTF-8 repeat */
8488 if ((ims & PCRE_DOTALL) == 0)
8490 for (i = min; i < max; i++)
8492 if (eptr >= md->end_subject || *eptr == NEWLINE) break;
8500 if (c > md->end_subject - eptr) c = md->end_subject - eptr;
8506 /* The byte case is the same as non-UTF8 */
8510 if (c > md->end_subject - eptr) c = md->end_subject - eptr;
8515 for (i = min; i < max; i++)
8518 if (eptr >= md->end_subject) break;
8519 GETCHARLEN(c, eptr, len);
8520 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
8526 for (i = min; i < max; i++)
8529 if (eptr >= md->end_subject) break;
8530 GETCHARLEN(c, eptr, len);
8531 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
8536 case OP_NOT_WHITESPACE:
8537 for (i = min; i < max; i++)
8540 if (eptr >= md->end_subject) break;
8541 GETCHARLEN(c, eptr, len);
8542 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
8548 for (i = min; i < max; i++)
8551 if (eptr >= md->end_subject) break;
8552 GETCHARLEN(c, eptr, len);
8553 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
8558 case OP_NOT_WORDCHAR:
8559 for (i = min; i < max; i++)
8562 if (eptr >= md->end_subject) break;
8563 GETCHARLEN(c, eptr, len);
8564 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
8570 for (i = min; i < max; i++)
8573 if (eptr >= md->end_subject) break;
8574 GETCHARLEN(c, eptr, len);
8575 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
8581 RRETURN(PCRE_ERROR_INTERNAL);
8584 /* eptr is now past the end of the maximum run */
8588 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
8589 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
8590 if (eptr-- == pp) break; /* Stop if tried at original pos */
8597 /* Not UTF-8 mode */
8602 if ((ims & PCRE_DOTALL) == 0)
8604 for (i = min; i < max; i++)
8606 if (eptr >= md->end_subject || *eptr == NEWLINE) break;
8611 /* For DOTALL case, fall through and treat as \C */
8615 if (c > md->end_subject - eptr) c = md->end_subject - eptr;
8620 for (i = min; i < max; i++)
8622 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0)
8629 for (i = min; i < max; i++)
8631 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) == 0)
8637 case OP_NOT_WHITESPACE:
8638 for (i = min; i < max; i++)
8640 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) != 0)
8647 for (i = min; i < max; i++)
8649 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) == 0)
8655 case OP_NOT_WORDCHAR:
8656 for (i = min; i < max; i++)
8658 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) != 0)
8665 for (i = min; i < max; i++)
8667 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) == 0)
8674 RRETURN(PCRE_ERROR_INTERNAL);
8677 /* eptr is now past the end of the maximum run */
8681 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
8683 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
8687 /* Get here if we can't make it match with any permitted repetitions */
8689 RRETURN(MATCH_NOMATCH);
8691 /* Control never gets here */
8693 /* There's been some horrible disaster. Since all codes > OP_BRA are
8694 for capturing brackets, and there shouldn't be any gaps between 0 and
8695 OP_BRA, arrival here can only mean there is something seriously wrong
8696 in the code above or the OP_xxx definitions. */
8699 DPRINTF(("Unknown opcode %d\n", *ecode));
8700 RRETURN(PCRE_ERROR_UNKNOWN_NODE);
8703 /* Do not stick any code in here without much thought; it is assumed
8704 that "continue" in the code above comes out to here to repeat the main loop. */
8707 } /* End of main loop */
8708 /* Control never reaches here */
8712 /***************************************************************************
8713 ****************************************************************************
8714 RECURSION IN THE match() FUNCTION
8716 Undefine all the macros that were defined above to handle this. */
8734 #undef new_recursive
8750 #undef save_capture_last
8760 /* These two are defined as macros in both cases */
8765 /***************************************************************************
8766 ***************************************************************************/
8770 /*************************************************
8771 * Execute a Regular Expression *
8772 *************************************************/
8774 /* This function applies a compiled re to a subject string and picks out
8775 portions of the string if it matches. Two elements in the vector are set for
8776 each substring: the offsets to the start and end of the substring.
8779 argument_re points to the compiled expression
8780 extra_data points to extra data or is NULL
8781 subject points to the subject string
8782 length length of subject string (may contain binary zeros)
8783 start_offset where to start in the subject string
8785 offsets points to a vector of ints to be filled in with offsets
8786 offsetcount the number of elements in the vector
8788 Returns: > 0 => success; value is the number of elements filled in
8789 = 0 => success, but offsets is not big enough
8790 -1 => failed to match
8791 < -1 => some kind of unexpected problem */
8795 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
8796 const char *subject, int length, int start_offset, int options, int *offsets,
8799 int rc, resetcount, ocount;
8800 int first_byte = -1;
8803 unsigned long int ims = 0;
8804 BOOL using_temporary_offsets = FALSE;
8807 BOOL first_byte_caseless = FALSE;
8808 BOOL req_byte_caseless = FALSE;
8809 match_data match_block;
8810 const uschar *tables;
8811 const uschar *start_bits = NULL;
8812 const uschar *start_match = (const uschar *)subject + start_offset;
8813 const uschar *end_subject;
8814 const uschar *req_byte_ptr = start_match - 1;
8816 pcre_study_data internal_study;
8817 const pcre_study_data *study;
8819 real_pcre internal_re;
8820 const real_pcre *external_re = (const real_pcre *)argument_re;
8821 const real_pcre *re = external_re;
8823 /* Plausibility checks */
8825 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
8826 if (re == NULL || subject == NULL ||
8827 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
8828 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
8830 /* Fish out the optional data from the extra_data structure, first setting
8831 the default values. */
8834 match_block.match_limit = MATCH_LIMIT;
8835 match_block.callout_data = NULL;
8837 /* The table pointer is always in native byte order. */
8839 tables = external_re->tables;
8841 if (extra_data != NULL)
8843 register unsigned int flags = extra_data->flags;
8844 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
8845 study = (const pcre_study_data *)extra_data->study_data;
8846 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
8847 match_block.match_limit = extra_data->match_limit;
8848 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
8849 match_block.callout_data = extra_data->callout_data;
8850 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
8853 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
8854 is a feature that makes it possible to save compiled regexes and re-use them
8855 in other programs later. */
8857 if (tables == NULL) tables = pcre_default_tables;
8859 /* Check that the first field in the block is the magic number. If it is not,
8860 test for a regex that was compiled on a host of opposite endianness. If this is
8861 the case, flipped values are put in internal_re and internal_study if there was study data too. */
8864 if (re->magic_number != MAGIC_NUMBER)
8866 re = try_flipped(re, &internal_re, study, &internal_study);
8867 if (re == NULL) return PCRE_ERROR_BADMAGIC;
8868 if (study != NULL) study = &internal_study;
8871 /* Set up other data */
8873 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
8874 startline = (re->options & PCRE_STARTLINE) != 0;
8876 /* The code starts after the real_pcre block and the capture name table. */
8878 match_block.start_code = (const uschar *)external_re + re->name_table_offset +
8879 re->name_count * re->name_entry_size;
8881 match_block.start_subject = (const uschar *)subject;
8882 match_block.start_offset = start_offset;
8883 match_block.end_subject = match_block.start_subject + length;
8884 end_subject = match_block.end_subject;
8886 match_block.endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
8887 match_block.utf8 = (re->options & PCRE_UTF8) != 0;
8889 match_block.notbol = (options & PCRE_NOTBOL) != 0;
8890 match_block.noteol = (options & PCRE_NOTEOL) != 0;
8891 match_block.notempty = (options & PCRE_NOTEMPTY) != 0;
8892 match_block.partial = (options & PCRE_PARTIAL) != 0;
8893 match_block.hitend = FALSE;
8895 match_block.recursive = NULL; /* No recursion at top level */
8897 match_block.lcc = tables + lcc_offset;
8898 match_block.ctypes = tables + ctypes_offset;
8900 /* Partial matching is supported only for a restricted set of regexes at the moment. */
8903 if (match_block.partial && (re->options & PCRE_NOPARTIAL) != 0)
8904 return PCRE_ERROR_BADPARTIAL;
8906 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
8907 back the character offset. */
8910 if (match_block.utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
8912 if (valid_utf8((uschar *)subject, length) >= 0)
8913 return PCRE_ERROR_BADUTF8;
8914 if (start_offset > 0 && start_offset < length)
8916 int tb = ((uschar *)subject)[start_offset];
8920 if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
8926 /* The ims options can vary during the matching as a result of the presence
8927 of (?ims) items in the pattern. They are kept in a local variable so that
8928 restoring at the exit of a group is easy. */
8930 ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
8932 /* If the expression has got more back references than the offsets supplied can
8933 hold, we get a temporary chunk of working store to use during the matching.
8934 Otherwise, we can use the vector supplied, rounding down its size to a multiple of 3. */
8937 ocount = offsetcount - (offsetcount % 3);
8939 if (re->top_backref > 0 && re->top_backref >= ocount/3)
8941 ocount = re->top_backref * 3 + 3;
8942 match_block.offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
8943 if (match_block.offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
8944 using_temporary_offsets = TRUE;
8945 DPRINTF(("Got memory to hold back references\n"));
8947 else match_block.offset_vector = offsets;
8949 match_block.offset_end = ocount;
8950 match_block.offset_max = (2*ocount)/3;
8951 match_block.offset_overflow = FALSE;
8952 match_block.capture_last = -1;
8954 /* Compute the minimum number of offsets that we need to reset each time. Doing
8955 this makes a huge difference to execution time when there aren't many brackets in the pattern. */
8958 resetcount = 2 + re->top_bracket * 2;
8959 if (resetcount > offsetcount) resetcount = ocount;
8961 /* Reset the working variable associated with each extraction. These should
8962 never be used unless previously set, but they get saved and restored, and so we
8963 initialize them to avoid reading uninitialized locations. */
8965 if (match_block.offset_vector != NULL)
8967 register int *iptr = match_block.offset_vector + ocount;
8968 register int *iend = iptr - resetcount/2 + 1;
8969 while (--iptr >= iend) *iptr = -1;
8972 /* Set up the first character to match, if available. The first_byte value is
8973 never set for an anchored regular expression, but the anchoring may be forced
8974 at run time, so we have to test for anchoring. The first char may be unset for
8975 an unanchored pattern, of course. If there's no first char and the pattern was
8976 studied, there may be a bitmap of possible first characters. */
8980 if ((re->options & PCRE_FIRSTSET) != 0)
8982 first_byte = re->first_byte & 255;
8983 if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
8984 first_byte = match_block.lcc[first_byte];
8987 if (!startline && study != NULL &&
8988 (study->options & PCRE_STUDY_MAPPED) != 0)
8989 start_bits = study->start_bits;
8992 /* For anchored or unanchored matches, there may be a "last known required character" set. */
8995 if ((re->options & PCRE_REQCHSET) != 0)
8997 req_byte = re->req_byte & 255;
8998 req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
8999 req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */
9002 /* Loop for handling unanchored repeated matching attempts; for anchored regexes
9003 the loop runs just once. */
9007 /* Reset the maximum number of extractions we might see. */
9009 if (match_block.offset_vector != NULL)
9011 register int *iptr = match_block.offset_vector;
9012 register int *iend = iptr + resetcount;
9013 while (iptr < iend) *iptr++ = -1;
9016 /* Advance to a unique first char if possible */
9018 if (first_byte >= 0)
9020 if (first_byte_caseless)
9021 while (start_match < end_subject &&
9022 match_block.lcc[*start_match] != first_byte)
9025 while (start_match < end_subject && *start_match != first_byte)
9029 /* Or to just after \n for a multiline match if possible */
9033 if (start_match > match_block.start_subject + start_offset)
9035 while (start_match < end_subject && start_match[-1] != NEWLINE)
9040 /* Or to a non-unique first char after study */
9042 else if (start_bits != NULL)
9044 while (start_match < end_subject)
9046 register unsigned int c = *start_match;
9047 if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; else break;
9051 #ifdef DEBUG /* Sigh. Some compilers never learn. */
9052 printf(">>>> Match against: ");
9053 pchars(start_match, end_subject - start_match, TRUE, &match_block);
9057 /* If req_byte is set, we know that that character must appear in the subject
9058 for the match to succeed. If the first character is set, req_byte must be
9059 later in the subject; otherwise the test starts at the match point. This
9060 optimization can save a huge amount of backtracking in patterns with nested
9061 unlimited repeats that aren't going to match. Writing separate code for
9062 cased/caseless versions makes it go faster, as does using an autoincrement
9063 and backing off on a match.
9065 HOWEVER: when the subject string is very, very long, searching to its end can
9066 take a long time, and give bad performance on quite ordinary patterns. This
9067 showed up when somebody was matching /^C/ on a 32-megabyte string... so we
9068 don't do this when the string is sufficiently long.
9070 ALSO: this processing is disabled when partial matching is requested. */
9073 if (req_byte >= 0 &&
9074 end_subject - start_match < REQ_BYTE_MAX &&
9075 !match_block.partial)
9077 register const uschar *p = start_match + ((first_byte >= 0)? 1 : 0);
9079 /* We don't need to repeat the search if we haven't yet reached the
9080 place we found it at last time. */
9082 if (p > req_byte_ptr)
9084 if (req_byte_caseless)
9086 while (p < end_subject)
9088 register int pp = *p++;
9089 if (pp == req_byte || pp == req_byte2) { p--; break; }
9094 while (p < end_subject)
9096 if (*p++ == req_byte) { p--; break; }
9100 /* If we can't find the required character, break the matching loop */
9102 if (p >= end_subject) break;
9104 /* If we have found the required character, save the point where we
9105 found it, so that we don't search again next time round the loop if
9106 the start hasn't passed this character yet. */
9112 /* When a match occurs, substrings will be set for all internal extractions;
9113 we just need to set up the whole thing as substring 0 before returning. If
9114 there were too many extractions, set the return code to zero. In the case
9115 where we had to get some local store to hold offsets for backreferences, copy
9116 those back references that we can. In this case there need not be overflow
9117 if certain parts of the pattern were not used. */
9119 match_block.start_match = start_match;
9120 match_block.match_call_count = 0;
9122 rc = match(start_match, match_block.start_code, 2, &match_block, ims, NULL,
9125 if (rc == MATCH_NOMATCH)
9129 if (match_block.utf8)
9130 while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
9136 if (rc != MATCH_MATCH)
9138 DPRINTF((">>>> error: returning %d\n", rc));
9142 /* We have a match! Copy the offset information from temporary store if necessary. */
9145 if (using_temporary_offsets)
9147 if (offsetcount >= 4)
9149 memcpy(offsets + 2, match_block.offset_vector + 2,
9150 (offsetcount - 2) * sizeof(int));
9151 DPRINTF(("Copied offsets from temporary memory\n"));
9153 if (match_block.end_offset_top > offsetcount)
9154 match_block.offset_overflow = TRUE;
9156 DPRINTF(("Freeing temporary memory\n"));
9157 (pcre_free)(match_block.offset_vector);
9160 rc = match_block.offset_overflow? 0 : match_block.end_offset_top/2;
9162 if (offsetcount < 2) rc = 0; else
9164 offsets[0] = start_match - match_block.start_subject;
9165 offsets[1] = match_block.end_match_ptr - match_block.start_subject;
9168 DPRINTF((">>>> returning %d\n", rc));
9172 /* This "while" is the end of the "do" above */
9174 while (!anchored && start_match <= end_subject);
9176 if (using_temporary_offsets)
9178 DPRINTF(("Freeing temporary memory\n"));
9179 (pcre_free)(match_block.offset_vector);
9182 if (match_block.partial && match_block.hitend)
9184 DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
9185 return PCRE_ERROR_PARTIAL;
9189 DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
9190 return PCRE_ERROR_NOMATCH;