1 /* $Cambridge: exim/src/src/pcre/pcre.c,v 1.2 2005/06/15 08:57:10 ph10 Exp $ */
3 /*************************************************
4 * Perl-Compatible Regular Expressions *
5 *************************************************/
8 This is a library of functions to support regular expressions whose syntax
9 and semantics are as close as possible to those of the Perl 5 language. See
10 the file Tech.Notes for some information on the internals.
12 Written by: Philip Hazel <ph10@cam.ac.uk>
14 Copyright (c) 1997-2004 University of Cambridge
16 -----------------------------------------------------------------------------
17 Redistribution and use in source and binary forms, with or without
18 modification, are permitted provided that the following conditions are met:
20 * Redistributions of source code must retain the above copyright notice,
21 this list of conditions and the following disclaimer.
23 * Redistributions in binary form must reproduce the above copyright
24 notice, this list of conditions and the following disclaimer in the
25 documentation and/or other materials provided with the distribution.
27 * Neither the name of the University of Cambridge nor the names of its
28 contributors may be used to endorse or promote products derived from
29 this software without specific prior written permission.
31 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
32 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
33 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
34 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
35 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
36 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
37 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
38 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
39 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
40 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
41 POSSIBILITY OF SUCH DAMAGE.
42 -----------------------------------------------------------------------------
46 /* Define DEBUG to get debugging output on stdout. */
49 /* Use a macro for debugging printing, 'cause that eliminates the use of #ifdef
50 inline, and there are *still* stupid compilers about that don't like indented
51 pre-processor statements. I suppose it's only been 10 years... */
54 #define DPRINTF(p) printf p
56 #define DPRINTF(p) /*nothing*/
59 /* Include the internals header, which itself includes "config.h", the Standard
60 C headers, and the external pcre header. */
64 /* If Unicode Property support is wanted, include a private copy of the
65 function that does it, and the table that translates names to numbers. */
69 #include "ucptypetable.c"
72 /* Maximum number of items on the nested bracket stacks at compile time. This
73 applies to the nesting of all kinds of parentheses. It does not limit
74 un-nested, non-capturing parentheses. This number can be made bigger if
75 necessary - it is used to dimension one int and one unsigned char vector at
78 #define BRASTACK_SIZE 200
81 /* Maximum number of ints of offset to save on the stack for recursive calls.
82 If the offset vector is bigger, malloc is used. This should be a multiple of 3,
83 because the offset vector is always a multiple of 3 long. */
85 #define REC_STACK_SAVE_MAX 30
88 /* The maximum remaining length of subject we are prepared to search for a
91 #define REQ_BYTE_MAX 1000
94 /* Table of sizes for the fixed-length opcodes. It's defined in a macro so that
95 the definition is next to the definition of the opcodes in internal.h. */
97 static const uschar OP_lengths[] = { OP_LENGTHS };
99 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
101 static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
102 static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
104 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
105 are simple data values; negative values are for special things like \d and so
106 on. Zero means further processing is needed (for things like \x), or the escape
109 #if !EBCDIC /* This is the "normal" table for ASCII systems */
110 static const short int escapes[] = {
111 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
112 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
113 '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */
114 0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
115 -ESC_P, -ESC_Q, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
116 -ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
117 '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */
118 0, 0, 0, 0, 0, 0, ESC_n, 0, /* h - o */
119 -ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, 0, -ESC_w, /* p - w */
120 0, 0, -ESC_z /* x - z */
123 #else /* This is the "abnormal" table for EBCDIC systems */
124 static const short int escapes[] = {
125 /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
126 /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
127 /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
128 /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
129 /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
130 /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
131 /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
132 /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
133 /* 88 */ 0, 0, 0, '{', 0, 0, 0, 0,
134 /* 90 */ 0, 0, 0, 'l', 0, ESC_n, 0, -ESC_p,
135 /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
136 /* A0 */ 0, '~', -ESC_s, ESC_tee, 0, 0, -ESC_w, 0,
137 /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
138 /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
139 /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
140 /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
141 /* C8 */ 0, 0, 0, 0, 0, 0, 0, 0,
142 /* D0 */ '}', 0, 0, 0, 0, 0, 0, -ESC_P,
143 /* D8 */-ESC_Q, 0, 0, 0, 0, 0, 0, 0,
144 /* E0 */ '\\', 0, -ESC_S, 0, 0, 0, -ESC_W, -ESC_X,
145 /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
146 /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
147 /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
152 /* Tables of names of POSIX character classes and their lengths. The list is
153 terminated by a zero length entry. The first three must be alpha, lower, upper,
154 as this is assumed for handling case independence. */
156 static const char *const posix_names[] = {
157 "alpha", "lower", "upper",
158 "alnum", "ascii", "blank", "cntrl", "digit", "graph",
159 "print", "punct", "space", "word", "xdigit" };
161 static const uschar posix_name_lengths[] = {
162 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
164 /* Table of class bit maps for each POSIX class; up to three may be combined
165 to form the class. The table for [:blank:] is dynamically modified to remove
166 the vertical space characters. */
168 static const int posix_class_maps[] = {
169 cbit_lower, cbit_upper, -1, /* alpha */
170 cbit_lower, -1, -1, /* lower */
171 cbit_upper, -1, -1, /* upper */
172 cbit_digit, cbit_lower, cbit_upper, /* alnum */
173 cbit_print, cbit_cntrl, -1, /* ascii */
174 cbit_space, -1, -1, /* blank - a GNU extension */
175 cbit_cntrl, -1, -1, /* cntrl */
176 cbit_digit, -1, -1, /* digit */
177 cbit_graph, -1, -1, /* graph */
178 cbit_print, -1, -1, /* print */
179 cbit_punct, -1, -1, /* punct */
180 cbit_space, -1, -1, /* space */
181 cbit_word, -1, -1, /* word - a Perl extension */
182 cbit_xdigit,-1, -1 /* xdigit */
185 /* Table to identify digits and hex digits. This is used when compiling
186 patterns. Note that the tables in chartables are dependent on the locale, and
187 may mark arbitrary characters as digits - but the PCRE compiling code expects
188 to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
189 a private table here. It costs 256 bytes, but it is a lot faster than doing
190 character value tests (at least in some simple cases I timed), and in some
191 applications one wants PCRE to compile efficiently as well as match
194 For convenience, we use the same bit definitions as in chartables:
197 0x08 hexadecimal digit
199 Then we can use ctype_digit and ctype_xdigit in the code. */
201 #if !EBCDIC /* This is the "normal" case, for ASCII systems */
202 static const unsigned char digitab[] =
204 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
205 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
206 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
207 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
208 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */
209 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */
210 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */
211 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */
212 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */
213 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */
214 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */
215 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */
216 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */
217 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */
218 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */
219 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */
220 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
221 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
222 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
223 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
224 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
225 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
226 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
227 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
228 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
229 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
230 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
231 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
232 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
233 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
234 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
235 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
237 #else /* This is the "abnormal" case, for EBCDIC systems */
238 static const unsigned char digitab[] =
240 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */
241 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
242 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */
243 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
244 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */
245 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
246 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */
247 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
248 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */
249 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */
250 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */
251 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- ¬ */
252 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */
253 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */
254 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
255 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
256 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 */
257 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
258 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */
259 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
260 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */
261 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
262 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */
263 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
264 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 */
265 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
266 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */
267 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
268 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */
269 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
270 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */
271 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
273 static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
274 0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */
275 0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */
276 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */
277 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
278 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 32- 39 */
279 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
280 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 */
281 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
282 0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 */
283 0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */
284 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */
285 0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- ¬ */
286 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */
287 0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */
288 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
289 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
290 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g */
291 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
292 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p */
293 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
294 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x */
295 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
296 0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 */
297 0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
298 0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* { - G */
299 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
300 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* } - P */
301 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
302 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* \ - X */
303 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
304 0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */
305 0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
309 /* Definition to allow mutual recursion */
312 compile_regex(int, int, int *, uschar **, const uschar **, const char **,
313 BOOL, int, int *, int *, branch_chain *, compile_data *);
315 /* Structure for building a chain of data that actually lives on the
316 stack, for holding the values of the subject pointer at the start of each
317 subpattern, so as to detect when an empty string has been matched by a
318 subpattern - to break infinite loops. When NO_RECURSE is set, these blocks
319 are on the heap, not on the stack. */
321 typedef struct eptrblock {
322 struct eptrblock *epb_prev;
323 const uschar *epb_saved_eptr;
326 /* Flag bits for the match() function */
328 #define match_condassert 0x01 /* Called to check a condition assertion */
329 #define match_isgroup 0x02 /* Set if start of bracketed group */
331 /* Non-error returns from the match() function. Error returns are externally
332 defined PCRE_ERROR_xxx codes, which are all negative. */
334 #define MATCH_MATCH 1
335 #define MATCH_NOMATCH 0
339 /*************************************************
341 *************************************************/
343 /* PCRE is thread-clean and doesn't use any global variables in the normal
344 sense. However, it calls memory allocation and free functions via the four
345 indirections below, and it can optionally do callouts. These values can be
346 changed by the caller, but are shared between all threads. However, when
347 compiling for Virtual Pascal, things are done differently (see pcre.in). */
351 extern "C" void *(*pcre_malloc)(size_t) = malloc;
352 extern "C" void (*pcre_free)(void *) = free;
353 extern "C" void *(*pcre_stack_malloc)(size_t) = malloc;
354 extern "C" void (*pcre_stack_free)(void *) = free;
355 extern "C" int (*pcre_callout)(pcre_callout_block *) = NULL;
357 void *(*pcre_malloc)(size_t) = malloc;
358 void (*pcre_free)(void *) = free;
359 void *(*pcre_stack_malloc)(size_t) = malloc;
360 void (*pcre_stack_free)(void *) = free;
361 int (*pcre_callout)(pcre_callout_block *) = NULL;
366 /*************************************************
367 * Macros and tables for character handling *
368 *************************************************/
370 /* When UTF-8 encoding is being used, a character is no longer just a single
371 byte. The macros for character handling generate simple sequences when used in
372 byte-mode, and more complicated ones for UTF-8 characters. */
375 #define GETCHAR(c, eptr) c = *eptr;
376 #define GETCHARINC(c, eptr) c = *eptr++;
377 #define GETCHARINCTEST(c, eptr) c = *eptr++;
378 #define GETCHARLEN(c, eptr, len) c = *eptr;
379 #define BACKCHAR(eptr)
381 #else /* SUPPORT_UTF8 */
383 /* Get the next UTF-8 character, not advancing the pointer. This is called when
384 we know we are in UTF-8 mode. */
386 #define GETCHAR(c, eptr) \
388 if ((c & 0xc0) == 0xc0) \
391 int gcaa = utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
393 c = (c & utf8_table3[gcaa]) << gcss; \
394 for (gcii = 1; gcii <= gcaa; gcii++) \
397 c |= (eptr[gcii] & 0x3f) << gcss; \
401 /* Get the next UTF-8 character, advancing the pointer. This is called when we
402 know we are in UTF-8 mode. */
404 #define GETCHARINC(c, eptr) \
406 if ((c & 0xc0) == 0xc0) \
408 int gcaa = utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
410 c = (c & utf8_table3[gcaa]) << gcss; \
414 c |= (*eptr++ & 0x3f) << gcss; \
418 /* Get the next character, testing for UTF-8 mode, and advancing the pointer */
420 #define GETCHARINCTEST(c, eptr) \
422 if (md->utf8 && (c & 0xc0) == 0xc0) \
424 int gcaa = utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
426 c = (c & utf8_table3[gcaa]) << gcss; \
430 c |= (*eptr++ & 0x3f) << gcss; \
434 /* Get the next UTF-8 character, not advancing the pointer, incrementing length
435 if there are extra bytes. This is called when we know we are in UTF-8 mode. */
437 #define GETCHARLEN(c, eptr, len) \
439 if ((c & 0xc0) == 0xc0) \
442 int gcaa = utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
444 c = (c & utf8_table3[gcaa]) << gcss; \
445 for (gcii = 1; gcii <= gcaa; gcii++) \
448 c |= (eptr[gcii] & 0x3f) << gcss; \
453 /* If the pointer is not at the start of a character, move it back until
454 it is. Called only in UTF-8 mode. */
456 #define BACKCHAR(eptr) while((*eptr & 0xc0) == 0x80) eptr--;
462 /*************************************************
463 * Default character tables *
464 *************************************************/
466 /* A default set of character tables is included in the PCRE binary. Its source
467 is built by the maketables auxiliary program, which uses the default C ctypes
468 functions, and put in the file chartables.c. These tables are used by PCRE
469 whenever the caller of pcre_compile() does not provide an alternate set of
472 #include "chartables.c"
477 /*************************************************
478 * Tables for UTF-8 support *
479 *************************************************/
481 /* These are the breakpoints for different numbers of bytes in a UTF-8
484 static const int utf8_table1[] =
485 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff};
487 /* These are the indicator bits and the mask for the data bits to set in the
488 first byte of a character, indexed by the number of additional bytes. */
490 static const int utf8_table2[] = { 0, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc};
491 static const int utf8_table3[] = { 0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01};
493 /* Table of the number of extra characters, indexed by the first character
494 masked with 0x3f. The highest number for a valid UTF-8 character is in fact
497 static const uschar utf8_table4[] = {
498 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
499 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
500 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
501 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 };
504 /*************************************************
505 * Convert character value to UTF-8 *
506 *************************************************/
508 /* This function takes an integer value in the range 0 - 0x7fffffff
509 and encodes it as a UTF-8 character in 1 to 6 bytes.
512 cvalue the character value
513 buffer pointer to buffer for result - at least 6 bytes long
515 Returns: number of characters placed in the buffer
519 ord2utf8(int cvalue, uschar *buffer)
522 for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
523 if (cvalue <= utf8_table1[i]) break;
525 for (j = i; j > 0; j--)
527 *buffer-- = 0x80 | (cvalue & 0x3f);
530 *buffer = utf8_table2[i] | cvalue;
537 /*************************************************
538 * Print compiled regex *
539 *************************************************/
541 /* The code for doing this is held in a separate file that is also included in
542 pcretest.c. It defines a function called print_internals(). */
545 #include "printint.c"
550 /*************************************************
551 * Return version string *
552 *************************************************/
554 #define STRING(a) # a
555 #define XSTRING(s) STRING(s)
560 return XSTRING(PCRE_MAJOR) "." XSTRING(PCRE_MINOR) " " XSTRING(PCRE_DATE);
566 /*************************************************
567 * Flip bytes in an integer *
568 *************************************************/
570 /* This function is called when the magic number in a regex doesn't match in
571 order to flip its bytes to see if we are dealing with a pattern that was
572 compiled on a host of different endianness. If so, this function is used to
573 flip other byte values.
576 value the number to flip
577 n the number of bytes to flip (assumed to be 2 or 4)
579 Returns: the flipped value
583 byteflip(long int value, int n)
585 if (n == 2) return ((value & 0x00ff) << 8) | ((value & 0xff00) >> 8);
586 return ((value & 0x000000ff) << 24) |
587 ((value & 0x0000ff00) << 8) |
588 ((value & 0x00ff0000) >> 8) |
589 ((value & 0xff000000) >> 24);
594 /*************************************************
595 * Test for a byte-flipped compiled regex *
596 *************************************************/
598 /* This function is called from pcre_exec() and also from pcre_fullinfo(). Its
599 job is to test whether the regex is byte-flipped - that is, it was compiled on
600 a system of opposite endianness. The function is called only when the native
601 MAGIC_NUMBER test fails. If the regex is indeed flipped, we flip all the
602 relevant values into a different data block, and return it.
605 re points to the regex
606 study points to study data, or NULL
607 internal_re points to a new regex block
608 internal_study points to a new study block
610 Returns: the new block if it is indeed a byte-flipped regex
615 try_flipped(const real_pcre *re, real_pcre *internal_re,
616 const pcre_study_data *study, pcre_study_data *internal_study)
618 if (byteflip(re->magic_number, sizeof(re->magic_number)) != MAGIC_NUMBER)
621 *internal_re = *re; /* To copy other fields */
622 internal_re->size = byteflip(re->size, sizeof(re->size));
623 internal_re->options = byteflip(re->options, sizeof(re->options));
624 internal_re->top_bracket = byteflip(re->top_bracket, sizeof(re->top_bracket));
625 internal_re->top_backref = byteflip(re->top_backref, sizeof(re->top_backref));
626 internal_re->first_byte = byteflip(re->first_byte, sizeof(re->first_byte));
627 internal_re->req_byte = byteflip(re->req_byte, sizeof(re->req_byte));
628 internal_re->name_table_offset = byteflip(re->name_table_offset,
629 sizeof(re->name_table_offset));
630 internal_re->name_entry_size = byteflip(re->name_entry_size,
631 sizeof(re->name_entry_size));
632 internal_re->name_count = byteflip(re->name_count, sizeof(re->name_count));
636 *internal_study = *study; /* To copy other fields */
637 internal_study->size = byteflip(study->size, sizeof(study->size));
638 internal_study->options = byteflip(study->options, sizeof(study->options));
646 /*************************************************
647 * (Obsolete) Return info about compiled pattern *
648 *************************************************/
650 /* This is the original "info" function. It picks potentially useful data out
651 of the private structure, but its interface was too rigid. It remains for
652 backwards compatibility. The public options are passed back in an int - though
653 the re->options field has been expanded to a long int, all the public options
654 are at the low end of it, and so even on 16-bit systems this will still be OK.
655 Therefore, I haven't changed the API for pcre_info().
658 argument_re points to compiled code
659 optptr where to pass back the options
660 first_byte where to pass back the first character,
661 or -1 if multiline and all branches start ^,
664 Returns: number of capturing subpatterns
665 or negative values on error
669 pcre_info(const pcre *argument_re, int *optptr, int *first_byte)
671 real_pcre internal_re;
672 const real_pcre *re = (const real_pcre *)argument_re;
673 if (re == NULL) return PCRE_ERROR_NULL;
674 if (re->magic_number != MAGIC_NUMBER)
676 re = try_flipped(re, &internal_re, NULL, NULL);
677 if (re == NULL) return PCRE_ERROR_BADMAGIC;
679 if (optptr != NULL) *optptr = (int)(re->options & PUBLIC_OPTIONS);
680 if (first_byte != NULL)
681 *first_byte = ((re->options & PCRE_FIRSTSET) != 0)? re->first_byte :
682 ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
683 return re->top_bracket;
688 /*************************************************
689 * Return info about compiled pattern *
690 *************************************************/
692 /* This is a newer "info" function which has an extensible interface so
693 that additional items can be added compatibly.
696 argument_re points to compiled code
697 extra_data points to extra data, or NULL
698 what what information is required
699 where where to put the information
701 Returns: 0 if data returned, negative on error
705 pcre_fullinfo(const pcre *argument_re, const pcre_extra *extra_data, int what,
708 real_pcre internal_re;
709 pcre_study_data internal_study;
710 const real_pcre *re = (const real_pcre *)argument_re;
711 const pcre_study_data *study = NULL;
713 if (re == NULL || where == NULL) return PCRE_ERROR_NULL;
715 if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_STUDY_DATA) != 0)
716 study = (const pcre_study_data *)extra_data->study_data;
718 if (re->magic_number != MAGIC_NUMBER)
720 re = try_flipped(re, &internal_re, study, &internal_study);
721 if (re == NULL) return PCRE_ERROR_BADMAGIC;
722 if (study != NULL) study = &internal_study;
727 case PCRE_INFO_OPTIONS:
728 *((unsigned long int *)where) = re->options & PUBLIC_OPTIONS;
732 *((size_t *)where) = re->size;
735 case PCRE_INFO_STUDYSIZE:
736 *((size_t *)where) = (study == NULL)? 0 : study->size;
739 case PCRE_INFO_CAPTURECOUNT:
740 *((int *)where) = re->top_bracket;
743 case PCRE_INFO_BACKREFMAX:
744 *((int *)where) = re->top_backref;
747 case PCRE_INFO_FIRSTBYTE:
749 ((re->options & PCRE_FIRSTSET) != 0)? re->first_byte :
750 ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
753 /* Make sure we pass back the pointer to the bit vector in the external
754 block, not the internal copy (with flipped integer fields). */
756 case PCRE_INFO_FIRSTTABLE:
757 *((const uschar **)where) =
758 (study != NULL && (study->options & PCRE_STUDY_MAPPED) != 0)?
759 ((const pcre_study_data *)extra_data->study_data)->start_bits : NULL;
762 case PCRE_INFO_LASTLITERAL:
764 ((re->options & PCRE_REQCHSET) != 0)? re->req_byte : -1;
767 case PCRE_INFO_NAMEENTRYSIZE:
768 *((int *)where) = re->name_entry_size;
771 case PCRE_INFO_NAMECOUNT:
772 *((int *)where) = re->name_count;
775 case PCRE_INFO_NAMETABLE:
776 *((const uschar **)where) = (const uschar *)re + re->name_table_offset;
779 case PCRE_INFO_DEFAULT_TABLES:
780 *((const uschar **)where) = (const uschar *)pcre_default_tables;
783 default: return PCRE_ERROR_BADOPTION;
791 /*************************************************
792 * Return info about what features are configured *
793 *************************************************/
795 /* This is a function which has an extensible interface so that additional items
796 can be added compatibly.
799 what what information is required
800 where where to put the information
802 Returns: 0 if data returned, negative on error
806 pcre_config(int what, void *where)
810 case PCRE_CONFIG_UTF8:
818 case PCRE_CONFIG_UNICODE_PROPERTIES:
826 case PCRE_CONFIG_NEWLINE:
827 *((int *)where) = NEWLINE;
830 case PCRE_CONFIG_LINK_SIZE:
831 *((int *)where) = LINK_SIZE;
834 case PCRE_CONFIG_POSIX_MALLOC_THRESHOLD:
835 *((int *)where) = POSIX_MALLOC_THRESHOLD;
838 case PCRE_CONFIG_MATCH_LIMIT:
839 *((unsigned int *)where) = MATCH_LIMIT;
842 case PCRE_CONFIG_STACKRECURSE:
850 default: return PCRE_ERROR_BADOPTION;
859 /*************************************************
860 * Debugging function to print chars *
861 *************************************************/
863 /* Print a sequence of chars in printable format, stopping at the end of the
864 subject if requested.
867 p points to characters
868 length number to print
869 is_subject TRUE if printing from within md->start_subject
870 md pointer to matching data block, if is_subject is TRUE
876 pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
879 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
881 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
888 /*************************************************
890 *************************************************/
892 /* This function is called when a \ has been encountered. It either returns a
893 positive value for a simple escape such as \n, or a negative value which
894 encodes one of the more complicated things such as \d. When UTF-8 is enabled,
895 a positive value greater than 255 may be returned. On entry, ptr is pointing at
896 the \. On exit, it is on the final character of the escape sequence.
899 ptrptr points to the pattern position pointer
900 errorptr points to the pointer to the error message
901 bracount number of previous extracting brackets
902 options the options bits
903 isclass TRUE if inside a character class
905 Returns: zero or positive => a data character
906 negative => a special escape sequence
907 on error, errorptr is set
911 check_escape(const uschar **ptrptr, const char **errorptr, int bracount,
912 int options, BOOL isclass)
914 const uschar *ptr = *ptrptr;
917 /* If backslash is at the end of the pattern, it's an error. */
920 if (c == 0) *errorptr = ERR1;
922 /* Non-alphamerics are literals. For digits or letters, do an initial lookup in
923 a table. A non-zero result is something that can be returned immediately.
924 Otherwise further processing may be required. */
926 #if !EBCDIC /* ASCII coding */
927 else if (c < '0' || c > 'z') {} /* Not alphameric */
928 else if ((i = escapes[c - '0']) != 0) c = i;
930 #else /* EBCDIC coding */
931 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphameric */
932 else if ((i = escapes[c - 0x48]) != 0) c = i;
935 /* Escapes that need further processing, or are illegal. */
939 const uschar *oldptr;
942 /* A number of Perl escapes are not handled by PCRE. We give an explicit
953 /* The handling of escape sequences consisting of a string of digits
954 starting with one that is not zero is not straightforward. By experiment,
955 the way Perl works seems to be as follows:
957 Outside a character class, the digits are read as a decimal number. If the
958 number is less than 10, or if there are that many previous extracting
959 left brackets, then it is a back reference. Otherwise, up to three octal
960 digits are read to form an escaped byte. Thus \123 is likely to be octal
961 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
962 value is greater than 377, the least significant 8 bits are taken. Inside a
963 character class, \ followed by a digit is always an octal number. */
965 case '1': case '2': case '3': case '4': case '5':
966 case '6': case '7': case '8': case '9':
972 while ((digitab[ptr[1]] & ctype_digit) != 0)
973 c = c * 10 + *(++ptr) - '0';
974 if (c < 10 || c <= bracount)
979 ptr = oldptr; /* Put the pointer back and fall through */
982 /* Handle an octal number following \. If the first digit is 8 or 9, Perl
983 generates a binary zero byte and treats the digit as a following literal.
984 Thus we have to pull back the pointer by one. */
986 if ((c = *ptr) >= '8')
993 /* \0 always starts an octal number, but we may drop through to here with a
994 larger first octal digit. */
998 while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
999 c = c * 8 + *(++ptr) - '0';
1000 c &= 255; /* Take least significant 8 bits */
1003 /* \x is complicated when UTF-8 is enabled. \x{ddd} is a character number
1004 which can be greater than 0xff, but only if the ddd are hex digits. */
1008 if (ptr[1] == '{' && (options & PCRE_UTF8) != 0)
1010 const uschar *pt = ptr + 2;
1011 register int count = 0;
1013 while ((digitab[*pt] & ctype_xdigit) != 0)
1017 #if !EBCDIC /* ASCII coding */
1018 if (cc >= 'a') cc -= 32; /* Convert to upper case */
1019 c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
1020 #else /* EBCDIC coding */
1021 if (cc >= 'a' && cc <= 'z') cc += 64; /* Convert to upper case */
1022 c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
1027 if (c < 0 || count > 8) *errorptr = ERR34;
1031 /* If the sequence of hex digits does not end with '}', then we don't
1032 recognize this construct; fall through to the normal \x handling. */
1036 /* Read just a single-byte hex-defined char (at most two hex digits) */
1039 while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
1041 int cc; /* Some compilers don't like ++ */
1042 cc = *(++ptr); /* in initializers */
1043 #if !EBCDIC /* ASCII coding */
1044 if (cc >= 'a') cc -= 32; /* Convert to upper case */
1045 c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
1046 #else /* EBCDIC coding */
1047 if (cc <= 'z') cc += 64; /* Convert to upper case */
1048 c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
1053 /* Other special escapes not starting with a digit are straightforward */
1063 /* A letter is upper-cased; then the 0x40 bit is flipped. This coding
1064 is ASCII-specific, but then the whole concept of \cx is ASCII-specific.
1065 (However, an EBCDIC equivalent has now been added.) */
1067 #if !EBCDIC /* ASCII coding */
1068 if (c >= 'a' && c <= 'z') c -= 32;
1070 #else /* EBCDIC coding */
1071 if (c >= 'a' && c <= 'z') c += 64;
1076 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
1077 other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
1078 for Perl compatibility, it is a literal. This code looks a bit odd, but
1079 there used to be some cases other than the default, and there may be again
1080 in future, so I haven't "optimized" it. */
1083 if ((options & PCRE_EXTRA) != 0) switch(c)
1100 /*************************************************
1101 * Handle \P and \p *
1102 *************************************************/
1104 /* This function is called after \P or \p has been encountered, provided that
1105 PCRE is compiled with support for Unicode properties. On entry, ptrptr is
1106 pointing at the P or p. On exit, it is pointing at the final character of the
1110 ptrptr points to the pattern position pointer
1111 negptr points to a boolean that is set TRUE for negation else FALSE
1112 errorptr points to the pointer to the error message
1114 Returns: value from ucp_type_table, or -1 for an invalid type
1118 get_ucp(const uschar **ptrptr, BOOL *negptr, const char **errorptr)
1121 const uschar *ptr = *ptrptr;
1125 if (c == 0) goto ERROR_RETURN;
1129 /* \P or \p can be followed by a one- or two-character name in {}, optionally
1130 preceded by ^ for negation. */
1139 for (i = 0; i <= 2; i++)
1142 if (c == 0) goto ERROR_RETURN;
1143 if (c == '}') break;
1146 if (c !='}') /* Try to distinguish error cases */
1148 while (*(++ptr) != 0 && *ptr != '}');
1149 if (*ptr == '}') goto UNKNOWN_RETURN; else goto ERROR_RETURN;
1154 /* Otherwise there is just one following character */
1164 /* Search for a recognized property name using binary chop */
1167 top = sizeof(utt)/sizeof(ucp_type_table);
1172 c = strcmp(name, utt[i].name);
1173 if (c == 0) return utt[i].value;
1174 if (c > 0) bot = i + 1; else top = i;
1192 /*************************************************
1193 * Check for counted repeat *
1194 *************************************************/
1196 /* This function is called when a '{' is encountered in a place where it might
1197 start a quantifier. It looks ahead to see if it really is a quantifier or not.
1198 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
1199 where the ddds are digits.
1202 p pointer to the first char after '{'
1204 Returns: TRUE or FALSE
1208 is_counted_repeat(const uschar *p)
1210 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
1211 while ((digitab[*p] & ctype_digit) != 0) p++;
1212 if (*p == '}') return TRUE;
1214 if (*p++ != ',') return FALSE;
1215 if (*p == '}') return TRUE;
1217 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
1218 while ((digitab[*p] & ctype_digit) != 0) p++;
1225 /*************************************************
1226 * Read repeat counts *
1227 *************************************************/
1229 /* Read an item of the form {n,m} and return the values. This is called only
1230 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
1231 so the syntax is guaranteed to be correct, but we need to check the values.
1234 p pointer to first char after '{'
1235 minp pointer to int for min
1236 maxp pointer to int for max
1237 returned as -1 if no max
1238 errorptr points to pointer to error message
1240 Returns: pointer to '}' on success;
1241 current ptr on error, with errorptr set
1244 static const uschar *
1245 read_repeat_counts(const uschar *p, int *minp, int *maxp, const char **errorptr)
1250 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
1252 if (*p == '}') max = min; else
1257 while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
1266 /* Do paranoid checks, then fill in the required variables, and pass back the
1267 pointer to the terminating '}'. */
1269 if (min > 65535 || max > 65535)
1281 /*************************************************
1282 * Find first significant op code *
1283 *************************************************/
1285 /* This is called by several functions that scan a compiled expression looking
1286 for a fixed first character, or an anchoring op code etc. It skips over things
1287 that do not influence this. For some calls, a change of option is important.
1288 For some calls, it makes sense to skip negative forward and all backward
1289 assertions, and also the \b assertion; for others it does not.
1292 code pointer to the start of the group
1293 options pointer to external options
1294 optbit the option bit whose changing is significant, or
1296 skipassert TRUE if certain assertions are to be skipped
1298 Returns: pointer to the first significant opcode
1301 static const uschar*
1302 first_significant_code(const uschar *code, int *options, int optbit,
1310 if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
1311 *options = (int)code[1];
1317 case OP_ASSERTBACK_NOT:
1318 if (!skipassert) return code;
1319 do code += GET(code, 1); while (*code == OP_ALT);
1320 code += OP_lengths[*code];
1323 case OP_WORD_BOUNDARY:
1324 case OP_NOT_WORD_BOUNDARY:
1325 if (!skipassert) return code;
1331 code += OP_lengths[*code];
1338 /* Control never reaches here */
1344 /*************************************************
1345 * Find the fixed length of a pattern *
1346 *************************************************/
1348 /* Scan a pattern and compute the fixed length of subject that will match it,
1349 if the length is fixed. This is needed for dealing with backward assertions.
1350 In UTF8 mode, the result is in characters rather than bytes.
1353 code points to the start of the pattern (the bracket)
1354 options the compiling options
1356 Returns: the fixed length, or -1 if there is no fixed length,
1357 or -2 if \C was encountered
1361 find_fixedlength(uschar *code, int options)
1365 register int branchlength = 0;
1366 register uschar *cc = code + 1 + LINK_SIZE;
1368 /* Scan along the opcodes for this branch. If we get to the end of the
1369 branch, check the length against that of the other branches. */
1374 register int op = *cc;
1375 if (op >= OP_BRA) op = OP_BRA;
1382 d = find_fixedlength(cc, options);
1383 if (d < 0) return d;
1385 do cc += GET(cc, 1); while (*cc == OP_ALT);
1386 cc += 1 + LINK_SIZE;
1389 /* Reached end of a branch; if it's a ket it is the end of a nested
1390 call. If it's ALT it is an alternation in a nested call. If it is
1391 END it's the end of the outer call. All can be handled by the same code. */
1398 if (length < 0) length = branchlength;
1399 else if (length != branchlength) return -1;
1400 if (*cc != OP_ALT) return length;
1401 cc += 1 + LINK_SIZE;
1405 /* Skip over assertive subpatterns */
1410 case OP_ASSERTBACK_NOT:
1411 do cc += GET(cc, 1); while (*cc == OP_ALT);
1414 /* Skip over things that don't match chars */
1427 case OP_NOT_WORD_BOUNDARY:
1428 case OP_WORD_BOUNDARY:
1429 cc += OP_lengths[*cc];
1432 /* Handle literal characters */
1439 if ((options & PCRE_UTF8) != 0)
1441 while ((*cc & 0xc0) == 0x80) cc++;
1446 /* Handle exact repetitions. The count is already in characters, but we
1447 need to skip over a multibyte character in UTF8 mode. */
1450 branchlength += GET2(cc,1);
1453 if ((options & PCRE_UTF8) != 0)
1455 while((*cc & 0x80) == 0x80) cc++;
1461 branchlength += GET2(cc,1);
1465 /* Handle single-char matchers */
1474 case OP_NOT_WHITESPACE:
1476 case OP_NOT_WORDCHAR:
1483 /* The single-byte matcher isn't allowed */
1488 /* Check a class for variable quantification */
1492 cc += GET(cc, 1) - 33;
1510 if (GET2(cc,1) != GET2(cc,3)) return -1;
1511 branchlength += GET2(cc,1);
1520 /* Anything else is variable length */
1526 /* Control never gets here */
1532 /*************************************************
1533 * Scan compiled regex for numbered bracket *
1534 *************************************************/
1536 /* This little function scans through a compiled pattern until it finds a
1537 capturing bracket with the given number.
1540 code points to start of expression
1541 utf8 TRUE in UTF-8 mode
1542 number the required bracket number
1544 Returns: pointer to the opcode for the bracket, or NULL if not found
1547 static const uschar *
1548 find_bracket(const uschar *code, BOOL utf8, int number)
1550 #ifndef SUPPORT_UTF8
1551 utf8 = utf8; /* Stop pedantic compilers complaining */
1556 register int c = *code;
1557 if (c == OP_END) return NULL;
1558 else if (c > OP_BRA)
1561 if (n > EXTRACT_BASIC_MAX) n = GET2(code, 2+LINK_SIZE);
1562 if (n == number) return (uschar *)code;
1563 code += OP_lengths[OP_BRA];
1567 code += OP_lengths[c];
1571 /* In UTF-8 mode, opcodes that are followed by a character may be followed
1572 by a multi-byte character. The length in the table is a minimum, so we have
1573 to scan along to skip the extra bytes. All opcodes are less than 128, so we
1574 can use relatively efficient code. */
1589 while ((*code & 0xc0) == 0x80) code++;
1592 /* XCLASS is used for classes that cannot be represented just by a bit
1593 map. This includes negated single high-valued characters. The length in
1594 the table is zero; the actual length is stored in the compiled code. */
1597 code += GET(code, 1) + 1;
1607 /*************************************************
1608 * Scan compiled regex for recursion reference *
1609 *************************************************/
1611 /* This little function scans through a compiled pattern until it finds an
1612 instance of OP_RECURSE.
1615 code points to start of expression
1616 utf8 TRUE in UTF-8 mode
1618 Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
1621 static const uschar *
1622 find_recurse(const uschar *code, BOOL utf8)
1624 #ifndef SUPPORT_UTF8
1625 utf8 = utf8; /* Stop pedantic compilers complaining */
1630 register int c = *code;
1631 if (c == OP_END) return NULL;
1632 else if (c == OP_RECURSE) return code;
1633 else if (c > OP_BRA)
1635 code += OP_lengths[OP_BRA];
1639 code += OP_lengths[c];
1643 /* In UTF-8 mode, opcodes that are followed by a character may be followed
1644 by a multi-byte character. The length in the table is a minimum, so we have
1645 to scan along to skip the extra bytes. All opcodes are less than 128, so we
1646 can use relatively efficient code. */
1661 while ((*code & 0xc0) == 0x80) code++;
1664 /* XCLASS is used for classes that cannot be represented just by a bit
1665 map. This includes negated single high-valued characters. The length in
1666 the table is zero; the actual length is stored in the compiled code. */
1669 code += GET(code, 1) + 1;
1679 /*************************************************
1680 * Scan compiled branch for non-emptiness *
1681 *************************************************/
1683 /* This function scans through a branch of a compiled pattern to see whether it
1684 can match the empty string or not. It is called from could_be_empty() below,
1685 and recursively by itself for nested closed brackets. Note that first_significant_code() skips over assertions. If we hit an
1686 unclosed bracket, we return "empty" - this means we've struck an inner bracket
1687 whose current branch will already have been scanned.
1690 code points to start of search
1691 endcode points to where to stop
1692 utf8 TRUE if in UTF8 mode
1694 Returns: TRUE if what is matched could be empty
1698 could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1701 for (code = first_significant_code(code + 1 + LINK_SIZE, NULL, 0, TRUE);
1703 code = first_significant_code(code + OP_lengths[c], NULL, 0, TRUE))
1705 const uschar *ccode;
1712 if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
1714 /* Scan a closed bracket */
1716 empty_branch = FALSE;
1719 if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1720 empty_branch = TRUE;
1721 code += GET(code, 1);
1723 while (*code == OP_ALT);
1724 if (!empty_branch) return FALSE; /* All branches are non-empty */
1725 code += 1 + LINK_SIZE;
1731 /* Check for quantifiers after a class */
1735 ccode = code + GET(code, 1);
1736 goto CHECK_CLASS_REPEAT;
1749 case OP_CRSTAR: /* These could be empty; continue */
1755 default: /* Non-repeat => class must match */
1756 case OP_CRPLUS: /* These repeats aren't empty */
1762 if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
1767 /* Opcodes that must match a character */
1774 case OP_NOT_WHITESPACE:
1776 case OP_NOT_WORDCHAR:
1790 case OP_TYPEMINPLUS:
1802 /* In UTF-8 mode, STAR, MINSTAR, QUERY, MINQUERY, UPTO, and MINUPTO may be
1803 followed by a multibyte character */
1812 if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1823 /*************************************************
1824 * Scan compiled regex for non-emptiness *
1825 *************************************************/
1827 /* This function is called to check for left recursive calls. We want to check
1828 the current branch of the current pattern to see if it could match the empty
1829 string. If it could, we must look outwards for branches at other levels,
1830 stopping when we pass beyond the bracket which is the subject of the recursion.
1833 code points to start of the recursion
1834 endcode points to where to stop (current RECURSE item)
1835 bcptr points to the chain of current (unclosed) branch starts
1836 utf8 TRUE if in UTF-8 mode
1838 Returns: TRUE if what is matched could be empty
1842 could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1845 while (bcptr != NULL && bcptr->current >= code)
1847 if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1848 bcptr = bcptr->outer;
1855 /*************************************************
1856 * Check for POSIX class syntax *
1857 *************************************************/
1859 /* This function is called when the sequence "[:" or "[." or "[=" is
1860 encountered in a character class. It checks whether this is followed by an
1861 optional ^ and then a sequence of letters, terminated by a matching ":]" or
1865 ptr pointer to the initial [
1866 endptr where to return the end pointer
1867 cd pointer to compile data
1869 Returns: TRUE or FALSE
1873 check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
1875 int terminator; /* Don't combine these lines; the Solaris cc */
1876 terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
1877 if (*(++ptr) == '^') ptr++;
1878 while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
1879 if (*ptr == terminator && ptr[1] == ']')
1890 /*************************************************
1891 * Check POSIX class name *
1892 *************************************************/
1894 /* This function is called to check the name given in a POSIX-style class entry
1898 ptr points to the first letter
1899 len the length of the name
1901 Returns: a value representing the name, or -1 if unknown
1905 check_posix_name(const uschar *ptr, int len)
1907 register int yield = 0;
1908 while (posix_name_lengths[yield] != 0)
1910 if (len == posix_name_lengths[yield] &&
1911 strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
1918 /*************************************************
1919 * Adjust OP_RECURSE items in repeated group *
1920 *************************************************/
1922 /* OP_RECURSE items contain an offset from the start of the regex to the group
1923 that is referenced. This means that groups can be replicated for fixed
1924 repetition simply by copying (because the recursion is allowed to refer to
1925 earlier groups that are outside the current group). However, when a group is
1926 optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
1927 it, after it has been compiled. This means that any OP_RECURSE items within it
1928 that refer to the group itself or any contained groups have to have their
1929 offsets adjusted. That is the job of this function. Before it is called, the
1930 partially compiled regex must be temporarily terminated with OP_END.
1933 group points to the start of the group
1934 adjust the amount by which the group is to be moved
1935 utf8 TRUE in UTF-8 mode
1936 cd contains pointers to tables etc.
1942 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd)
1944 uschar *ptr = group;
1945 while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
1947 int offset = GET(ptr, 1);
1948 if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
1949 ptr += 1 + LINK_SIZE;
1955 /*************************************************
1956 * Insert an automatic callout point *
1957 *************************************************/
1959 /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
1960 callout points before each pattern item.
1963 code current code pointer
1964 ptr current pattern pointer
1965 cd pointers to tables etc
1967 Returns: new code pointer
1971 auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
1973 *code++ = OP_CALLOUT;
1975 PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
1976 PUT(code, LINK_SIZE, 0); /* Default length */
1977 return code + 2*LINK_SIZE;
1982 /*************************************************
1983 * Complete a callout item *
1984 *************************************************/
1986 /* A callout item contains the length of the next item in the pattern, which
1987 we can't fill in till after we have reached the relevant point. This is used
1988 for both automatic and manual callouts.
1991 previous_callout points to previous callout item
1992 ptr current pattern pointer
1993 cd pointers to tables etc
1999 complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
2001 int length = ptr - cd->start_pattern - GET(previous_callout, 2);
2002 PUT(previous_callout, 2 + LINK_SIZE, length);
2008 /*************************************************
2009 * Get othercase range *
2010 *************************************************/
2012 /* This function is passed the start and end of a class range, in UTF-8 mode
2013 with UCP support. It searches up the characters, looking for internal ranges of
2014 characters in the "other" case. Each call returns the next one, updating the
2018 cptr points to starting character value; updated
2020 ocptr where to put start of othercase range
2021 odptr where to put end of othercase range
2023 Yield: TRUE when range returned; FALSE when no more
2027 get_othercase_range(int *cptr, int d, int *ocptr, int *odptr)
2029 int c, chartype, othercase, next;
2031 for (c = *cptr; c <= d; c++)
2033 if (ucp_findchar(c, &chartype, &othercase) == ucp_L && othercase != 0) break;
2036 if (c > d) return FALSE;
2039 next = othercase + 1;
2041 for (++c; c <= d; c++)
2043 if (ucp_findchar(c, &chartype, &othercase) != ucp_L || othercase != next)
2053 #endif /* SUPPORT_UCP */
2056 /*************************************************
2057 * Compile one branch *
2058 *************************************************/
2060 /* Scan the pattern, compiling it into the code vector. If the options are
2061 changed during the branch, the pointer is used to change the external options
2065 optionsptr pointer to the option bits
2066 brackets points to number of extracting brackets used
2067 codeptr points to the pointer to the current code point
2068 ptrptr points to the current pattern pointer
2069 errorptr points to pointer to error message
2070 firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2071 reqbyteptr set to the last literal character required, else < 0
2072 bcptr points to current branch chain
2073 cd contains pointers to tables etc.
2075 Returns: TRUE on success
2076 FALSE, with *errorptr set on error
2080 compile_branch(int *optionsptr, int *brackets, uschar **codeptr,
2081 const uschar **ptrptr, const char **errorptr, int *firstbyteptr,
2082 int *reqbyteptr, branch_chain *bcptr, compile_data *cd)
2084 int repeat_type, op_type;
2085 int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
2087 int greedy_default, greedy_non_default;
2088 int firstbyte, reqbyte;
2089 int zeroreqbyte, zerofirstbyte;
2090 int req_caseopt, reqvary, tempreqvary;
2092 int options = *optionsptr;
2093 int after_manual_callout = 0;
2095 register uschar *code = *codeptr;
2097 BOOL inescq = FALSE;
2098 BOOL groupsetfirstbyte = FALSE;
2099 const uschar *ptr = *ptrptr;
2100 const uschar *tempptr;
2101 uschar *previous = NULL;
2102 uschar *previous_callout = NULL;
2103 uschar classbits[32];
2107 BOOL utf8 = (options & PCRE_UTF8) != 0;
2108 uschar *class_utf8data;
2109 uschar utf8_char[6];
2114 /* Set up the default and non-default settings for greediness */
2116 greedy_default = ((options & PCRE_UNGREEDY) != 0);
2117 greedy_non_default = greedy_default ^ 1;
2119 /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2120 matching encountered yet". It gets changed to REQ_NONE if we hit something that
2121 matches a non-fixed first char; reqbyte just remains unset if we never
2124 When we hit a repeat whose minimum is zero, we may have to adjust these values
2125 to take the zero repeat into account. This is implemented by setting them to
2126 zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2127 item types that can be repeated set these backoff variables appropriately. */
2129 firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2131 /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2132 according to the current setting of the caseless flag. REQ_CASELESS is a bit
2133 value > 255. It is added into the firstbyte or reqbyte variables to record the
2134 case status of the value. This is used only for ASCII characters. */
2136 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2138 /* Switch on next character until the end of the branch */
2143 BOOL possessive_quantifier;
2145 int class_charcount;
2155 /* Next byte in the pattern */
2159 /* If in \Q...\E, check for the end; if not, we have a literal */
2161 if (inescq && c != 0)
2163 if (c == '\\' && ptr[1] == 'E')
2171 if (previous_callout != NULL)
2173 complete_callout(previous_callout, ptr, cd);
2174 previous_callout = NULL;
2176 if ((options & PCRE_AUTO_CALLOUT) != 0)
2178 previous_callout = code;
2179 code = auto_callout(code, ptr, cd);
2185 /* Fill in length of a previous callout, except when the next thing is
2188 is_quantifier = c == '*' || c == '+' || c == '?' ||
2189 (c == '{' && is_counted_repeat(ptr+1));
2191 if (!is_quantifier && previous_callout != NULL &&
2192 after_manual_callout-- <= 0)
2194 complete_callout(previous_callout, ptr, cd);
2195 previous_callout = NULL;
2198 /* In extended mode, skip white space and comments */
2200 if ((options & PCRE_EXTENDED) != 0)
2202 if ((cd->ctypes[c] & ctype_space) != 0) continue;
2205 /* The space before the ; is to avoid a warning on a silly compiler
2206 on the Macintosh. */
2207 while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
2208 if (c != 0) continue; /* Else fall through to handle end of string */
2212 /* No auto callout for quantifiers. */
2214 if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
2216 previous_callout = code;
2217 code = auto_callout(code, ptr, cd);
2222 /* The branch terminates at end of string, |, or ). */
2227 *firstbyteptr = firstbyte;
2228 *reqbyteptr = reqbyte;
2233 /* Handle single-character metacharacters. In multiline mode, ^ disables
2234 the setting of any following char as a first character. */
2237 if ((options & PCRE_MULTILINE) != 0)
2239 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2250 /* There can never be a first char if '.' is first, whatever happens about
2251 repeats. The value of reqbyte doesn't change either. */
2254 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2255 zerofirstbyte = firstbyte;
2256 zeroreqbyte = reqbyte;
2261 /* Character classes. If the included characters are all < 256 in value, we
2262 build a 32-byte bitmap of the permitted characters, except in the special
2263 case where there is only one such character. For negated classes, we build
2264 the map as usual, then invert it at the end. However, we use a different
2265 opcode so that data characters > 255 can be handled correctly.
2267 If the class contains characters outside the 0-255 range, a different
2268 opcode is compiled. It may optionally have a bit map for characters < 256,
2269 but those above are explicitly listed afterwards. A flag byte tells
2270 whether the bitmap is present, and whether this is a negated class or not.
2276 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
2277 they are encountered at the top level, so we'll do that too. */
2279 if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2280 check_posix_syntax(ptr, &tempptr, cd))
2282 *errorptr = (ptr[1] == ':')? ERR13 : ERR31;
2286 /* If the first character is '^', set the negation flag and skip it. */
2288 if ((c = *(++ptr)) == '^')
2290 negate_class = TRUE;
2295 negate_class = FALSE;
2298 /* Keep a count of chars with values < 256 so that we can optimize the case
2299 of just a single character (as long as it's < 256). For higher valued UTF-8
2300 characters, we don't yet do any optimization. */
2302 class_charcount = 0;
2303 class_lastchar = -1;
2306 class_utf8 = FALSE; /* No chars >= 256 */
2307 class_utf8data = code + LINK_SIZE + 34; /* For UTF-8 items */
2310 /* Initialize the 32-char bit map to all zeros. We have to build the
2311 map in a temporary bit of store, in case the class contains only 1
2312 character (< 256), because in that case the compiled code doesn't use the
2315 memset(classbits, 0, 32 * sizeof(uschar));
2317 /* Process characters until ] is reached. By writing this as a "do" it
2318 means that an initial ] is taken as a data character. The first pass
2319 through the regex checked the overall syntax, so we don't need to be very
2320 strict here. At the start of the loop, c contains the first byte of the
2326 if (utf8 && c > 127)
2327 { /* Braces are required because the */
2328 GETCHARLEN(c, ptr, ptr); /* macro generates multiple statements */
2332 /* Inside \Q...\E everything is literal except \E */
2336 if (c == '\\' && ptr[1] == 'E')
2342 else goto LONE_SINGLE_CHARACTER;
2345 /* Handle POSIX class names. Perl allows a negation extension of the
2346 form [:^name:]. A square bracket that doesn't match the syntax is
2347 treated as a literal. We also recognize the POSIX constructions
2348 [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
2352 (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
2353 check_posix_syntax(ptr, &tempptr, cd))
2355 BOOL local_negate = FALSE;
2357 register const uschar *cbits = cd->cbits;
2368 local_negate = TRUE;
2372 posix_class = check_posix_name(ptr, tempptr - ptr);
2373 if (posix_class < 0)
2379 /* If matching is caseless, upper and lower are converted to
2380 alpha. This relies on the fact that the class table starts with
2381 alpha, lower, upper as the first 3 entries. */
2383 if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
2386 /* Or into the map we are building up to 3 of the static class
2387 tables, or their negations. The [:blank:] class sets up the same
2388 chars as the [:space:] class (all white space). We remove the vertical
2389 white space chars afterwards. */
2392 for (i = 0; i < 3; i++)
2394 BOOL blankclass = strncmp((char *)ptr, "blank", 5) == 0;
2395 int taboffset = posix_class_maps[posix_class + i];
2396 if (taboffset < 0) break;
2400 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+taboffset];
2402 for (c = 0; c < 32; c++) classbits[c] &= ~cbits[c+taboffset];
2403 if (blankclass) classbits[1] |= 0x3c;
2407 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+taboffset];
2408 if (blankclass) classbits[1] &= ~0x3c;
2413 class_charcount = 10; /* Set > 1; assumes more than 1 per class */
2414 continue; /* End of POSIX syntax handling */
2417 /* Backslash may introduce a single character, or it may introduce one
2418 of the specials, which just set a flag. Escaped items are checked for
2419 validity in the pre-compiling pass. The sequence \b is a special case.
2420 Inside a class (and only there) it is treated as backspace. Elsewhere
2421 it marks a word boundary. Other escapes have preset maps ready to
2422 or into the one we are building. We assume they have more than one
2423 character in them, so set class_charcount bigger than one. */
2427 c = check_escape(&ptr, errorptr, *brackets, options, TRUE);
2429 if (-c == ESC_b) c = '\b'; /* \b is backspace in a class */
2430 else if (-c == ESC_X) c = 'X'; /* \X is literal X in a class */
2431 else if (-c == ESC_Q) /* Handle start of quoted string */
2433 if (ptr[1] == '\\' && ptr[2] == 'E')
2435 ptr += 2; /* avoid empty string */
2443 register const uschar *cbits = cd->cbits;
2444 class_charcount += 2; /* Greater than 1 is what matters */
2448 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
2452 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
2456 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
2460 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
2464 for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
2465 classbits[1] &= ~0x08; /* Perl 5.004 onwards omits VT from \s */
2469 for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
2470 classbits[1] |= 0x08; /* Perl 5.004 onwards omits VT from \s */
2478 int property = get_ucp(&ptr, &negated, errorptr);
2479 if (property < 0) goto FAILED;
2481 *class_utf8data++ = ((-c == ESC_p) != negated)?
2482 XCL_PROP : XCL_NOTPROP;
2483 *class_utf8data++ = property;
2484 class_charcount -= 2; /* Not a < 256 character */
2489 /* Unrecognized escapes are faulted if PCRE is running in its
2490 strict mode. By default, for compatibility with Perl, they are
2491 treated as literals. */
2494 if ((options & PCRE_EXTRA) != 0)
2499 c = *ptr; /* The final character */
2500 class_charcount -= 2; /* Undo the default count from above */
2504 /* Fall through if we have a single character (c >= 0). This may be
2505 > 256 in UTF-8 mode. */
2507 } /* End of backslash handling */
2509 /* A single character may be followed by '-' to form a range. However,
2510 Perl does not permit ']' to be the end of the range. A '-' character
2511 here is treated as a literal. */
2513 if (ptr[1] == '-' && ptr[2] != ']')
2520 { /* Braces are required because the */
2521 GETCHARLEN(d, ptr, ptr); /* macro generates multiple statements */
2525 d = *ptr; /* Not UTF-8 mode */
2527 /* The second part of a range can be a single-character escape, but
2528 not any of the other escapes. Perl 5.6 treats a hyphen as a literal
2529 in such circumstances. */
2533 const uschar *oldptr = ptr;
2534 d = check_escape(&ptr, errorptr, *brackets, options, TRUE);
2536 /* \b is backspace; \X is literal X; any other special means the '-'
2541 if (d == -ESC_b) d = '\b';
2542 else if (d == -ESC_X) d = 'X'; else
2545 goto LONE_SINGLE_CHARACTER; /* A few lines below */
2550 /* The check that the two values are in the correct order happens in
2551 the pre-pass. Optimize one-character ranges */
2553 if (d == c) goto LONE_SINGLE_CHARACTER; /* A few lines below */
2555 /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
2556 matching, we have to use an XCLASS with extra data items. Caseless
2557 matching for characters > 127 is available only if UCP support is
2561 if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
2565 /* With UCP support, we can find the other case equivalents of
2566 the relevant characters. There may be several ranges. Optimize how
2567 they fit with the basic range. */
2570 if ((options & PCRE_CASELESS) != 0)
2575 while (get_othercase_range(&cc, origd, &occ, &ocd))
2577 if (occ >= c && ocd <= d) continue; /* Skip embedded ranges */
2579 if (occ < c && ocd >= c - 1) /* Extend the basic range */
2580 { /* if there is overlap, */
2581 c = occ; /* noting that if occ < c */
2582 continue; /* we can't have ocd > d */
2583 } /* because a subrange is */
2584 if (ocd > d && occ <= d + 1) /* always shorter than */
2585 { /* the basic range. */
2592 *class_utf8data++ = XCL_SINGLE;
2596 *class_utf8data++ = XCL_RANGE;
2597 class_utf8data += ord2utf8(occ, class_utf8data);
2599 class_utf8data += ord2utf8(ocd, class_utf8data);
2602 #endif /* SUPPORT_UCP */
2604 /* Now record the original range, possibly modified for UCP caseless
2605 overlapping ranges. */
2607 *class_utf8data++ = XCL_RANGE;
2608 class_utf8data += ord2utf8(c, class_utf8data);
2609 class_utf8data += ord2utf8(d, class_utf8data);
2611 /* With UCP support, we are done. Without UCP support, there is no
2612 caseless matching for UTF-8 characters > 127; we can use the bit map
2613 for the smaller ones. */
2616 continue; /* With next character in the class */
2618 if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
2620 /* Adjust upper limit and fall through to set up the map */
2624 #endif /* SUPPORT_UCP */
2626 #endif /* SUPPORT_UTF8 */
2628 /* We use the bit map for all cases when not in UTF-8 mode; else
2629 ranges that lie entirely within 0-127 when there is UCP support; else
2630 for partial ranges without UCP support. */
2634 classbits[c/8] |= (1 << (c&7));
2635 if ((options & PCRE_CASELESS) != 0)
2637 int uc = cd->fcc[c]; /* flip case */
2638 classbits[uc/8] |= (1 << (uc&7));
2640 class_charcount++; /* in case a one-char range */
2644 continue; /* Go get the next char in the class */
2647 /* Handle a lone single character - we can get here for a normal
2648 non-escape char, or after \ that introduces a single character or for an
2649 apparent range that isn't. */
2651 LONE_SINGLE_CHARACTER:
2653 /* Handle a character that cannot go in the bit map */
2656 if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
2659 *class_utf8data++ = XCL_SINGLE;
2660 class_utf8data += ord2utf8(c, class_utf8data);
2663 if ((options & PCRE_CASELESS) != 0)
2667 if (ucp_findchar(c, &chartype, &othercase) >= 0 && othercase > 0)
2669 *class_utf8data++ = XCL_SINGLE;
2670 class_utf8data += ord2utf8(othercase, class_utf8data);
2673 #endif /* SUPPORT_UCP */
2677 #endif /* SUPPORT_UTF8 */
2679 /* Handle a single-byte character */
2681 classbits[c/8] |= (1 << (c&7));
2682 if ((options & PCRE_CASELESS) != 0)
2684 c = cd->fcc[c]; /* flip case */
2685 classbits[c/8] |= (1 << (c&7));
2692 /* Loop until ']' reached; the check for end of string happens inside the
2693 loop. This "while" is the end of the "do" above. */
2695 while ((c = *(++ptr)) != ']' || inescq);
2697 /* If class_charcount is 1, we saw precisely one character whose value is
2698 less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we
2699 can optimize the negative case only if there were no characters >= 128
2700 because OP_NOT and the related opcodes like OP_NOTSTAR operate on
2701 single-bytes only. This is an historical hangover. Maybe one day we can
2702 tidy these opcodes to handle multi-byte characters.
2704 The optimization throws away the bit map. We turn the item into a
2705 1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
2706 that OP_NOT does not support multibyte characters. In the positive case, it
2707 can cause firstbyte to be set. Otherwise, there can be no first char if
2708 this item is first, whatever repeat count may follow. In the case of
2709 reqbyte, save the previous value for reinstating. */
2712 if (class_charcount == 1 &&
2714 (!class_utf8 && (!negate_class || class_lastchar < 128))))
2717 if (class_charcount == 1)
2720 zeroreqbyte = reqbyte;
2722 /* The OP_NOT opcode works on one-byte characters only. */
2726 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2727 zerofirstbyte = firstbyte;
2729 *code++ = class_lastchar;
2733 /* For a single, positive character, get the value into mcbuffer, and
2734 then we can handle this with the normal one-character code. */
2737 if (utf8 && class_lastchar > 127)
2738 mclength = ord2utf8(class_lastchar, mcbuffer);
2742 mcbuffer[0] = class_lastchar;
2746 } /* End of 1-char optimization */
2748 /* The general case - not the one-char optimization. If this is the first
2749 thing in the branch, there can be no first char setting, whatever the
2750 repeat count. Any reqbyte setting must remain unchanged after any kind of
2753 if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2754 zerofirstbyte = firstbyte;
2755 zeroreqbyte = reqbyte;
2757 /* If there are characters with values > 255, we have to compile an
2758 extended class, with its own opcode. If there are no characters < 256,
2759 we can omit the bitmap. */
2764 *class_utf8data++ = XCL_END; /* Marks the end of extra data */
2765 *code++ = OP_XCLASS;
2767 *code = negate_class? XCL_NOT : 0;
2769 /* If the map is required, install it, and move on to the end of
2772 if (class_charcount > 0)
2775 memcpy(code, classbits, 32);
2776 code = class_utf8data;
2779 /* If the map is not required, slide down the extra data. */
2783 int len = class_utf8data - (code + 33);
2784 memmove(code + 1, code + 33, len);
2788 /* Now fill in the complete length of the item */
2790 PUT(previous, 1, code - previous);
2791 break; /* End of class handling */
2795 /* If there are no characters > 255, negate the 32-byte map if necessary,
2796 and copy it into the code vector. If this is the first thing in the branch,
2797 there can be no first char setting, whatever the repeat count. Any reqbyte
2798 setting must remain unchanged after any kind of repeat. */
2802 *code++ = OP_NCLASS;
2803 for (c = 0; c < 32; c++) code[c] = ~classbits[c];
2808 memcpy(code, classbits, 32);
2813 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
2814 has been tested above. */
2817 if (!is_quantifier) goto NORMAL_CHAR;
2818 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorptr);
2819 if (*errorptr != NULL) goto FAILED;
2837 if (previous == NULL)
2843 if (repeat_min == 0)
2845 firstbyte = zerofirstbyte; /* Adjust for zero repeat */
2846 reqbyte = zeroreqbyte; /* Ditto */
2849 /* Remember whether this is a variable length repeat */
2851 reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
2853 op_type = 0; /* Default single-char op codes */
2854 possessive_quantifier = FALSE; /* Default not possessive quantifier */
2856 /* Save start of previous item, in case we have to move it up to make space
2857 for an inserted OP_ONCE for the additional '+' extension. */
2859 tempcode = previous;
2861 /* If the next character is '+', we have a possessive quantifier. This
2862 implies greediness, whatever the setting of the PCRE_UNGREEDY option.
2863 If the next character is '?' this is a minimizing repeat, by default,
2864 but if PCRE_UNGREEDY is set, it works the other way round. We change the
2865 repeat type to the non-default. */
2869 repeat_type = 0; /* Force greedy */
2870 possessive_quantifier = TRUE;
2873 else if (ptr[1] == '?')
2875 repeat_type = greedy_non_default;
2878 else repeat_type = greedy_default;
2880 /* If previous was a recursion, we need to wrap it inside brackets so that
2881 it can be replicated if necessary. */
2883 if (*previous == OP_RECURSE)
2885 memmove(previous + 1 + LINK_SIZE, previous, 1 + LINK_SIZE);
2886 code += 1 + LINK_SIZE;
2888 PUT(previous, 1, code - previous);
2890 PUT(code, 1, code - previous);
2891 code += 1 + LINK_SIZE;
2894 /* If previous was a character match, abolish the item and generate a
2895 repeat item instead. If a char item has a minimum of more than one, ensure
2896 that it is set in reqbyte - it might not be if a sequence such as x{3} is
2897 the first thing in a branch because the x will have gone into firstbyte
2900 if (*previous == OP_CHAR || *previous == OP_CHARNC)
2902 /* Deal with UTF-8 characters that take up more than one byte. It's
2903 easier to write this out separately than try to macrify it. Use c to
2904 hold the length of the character in bytes, plus 0x80 to flag that it's a
2905 length rather than a small character. */
2908 if (utf8 && (code[-1] & 0x80) != 0)
2910 uschar *lastchar = code - 1;
2911 while((*lastchar & 0xc0) == 0x80) lastchar--;
2912 c = code - lastchar; /* Length of UTF-8 character */
2913 memcpy(utf8_char, lastchar, c); /* Save the char */
2914 c |= 0x80; /* Flag c as a length */
2919 /* Handle the case of a single byte - either with no UTF8 support, or
2920 with UTF-8 disabled, or for a UTF-8 character < 128. */
2924 if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
2927 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
2930 /* If previous was a single negated character ([^a] or similar), we use
2931 one of the special opcodes, replacing it. The code is shared with single-
2932 character repeats by setting op_type to add a suitable offset into
2933 repeat_type. OP_NOT is currently used only for single-byte chars. */
2935 else if (*previous == OP_NOT)
2937 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
2939 goto OUTPUT_SINGLE_REPEAT;
2942 /* If previous was a character type match (\d or similar), abolish it and
2943 create a suitable repeat item. The code is shared with single-character
2944 repeats by setting op_type to add a suitable offset into repeat_type. Note
2945 that the Unicode property types will be present only when SUPPORT_UCP is
2946 defined, but we don't wrap the little bits of code here because it just
2947 makes it horribly messy. */
2949 else if (*previous < OP_EODN)
2953 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
2956 OUTPUT_SINGLE_REPEAT:
2957 prop_type = (*previous == OP_PROP || *previous == OP_NOTPROP)?
2961 code = previous; /* Usually overwrite previous item */
2963 /* If the maximum is zero then the minimum must also be zero; Perl allows
2964 this case, so we do too - by simply omitting the item altogether. */
2966 if (repeat_max == 0) goto END_REPEAT;
2968 /* All real repeats make it impossible to handle partial matching (maybe
2969 one day we will be able to remove this restriction). */
2971 if (repeat_max != 1) cd->nopartial = TRUE;
2973 /* Combine the op_type with the repeat_type */
2975 repeat_type += op_type;
2977 /* A minimum of zero is handled either as the special case * or ?, or as
2978 an UPTO, with the maximum given. */
2980 if (repeat_min == 0)
2982 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
2983 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
2986 *code++ = OP_UPTO + repeat_type;
2987 PUT2INC(code, 0, repeat_max);
2991 /* A repeat minimum of 1 is optimized into some special cases. If the
2992 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
2993 left in place and, if the maximum is greater than 1, we use OP_UPTO with
2994 one less than the maximum. */
2996 else if (repeat_min == 1)
2998 if (repeat_max == -1)
2999 *code++ = OP_PLUS + repeat_type;
3002 code = oldcode; /* leave previous item in place */
3003 if (repeat_max == 1) goto END_REPEAT;
3004 *code++ = OP_UPTO + repeat_type;
3005 PUT2INC(code, 0, repeat_max - 1);
3009 /* The case {n,n} is just an EXACT, while the general case {n,m} is
3010 handled as an EXACT followed by an UPTO. */
3014 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
3015 PUT2INC(code, 0, repeat_min);
3017 /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
3018 we have to insert the character for the previous code. For a repeated
3019 Unicode property match, there is an extra byte that defines the
3020 required property. In UTF-8 mode, long characters have their length in
3021 c, with the 0x80 bit as a flag. */
3026 if (utf8 && c >= 128)
3028 memcpy(code, utf8_char, c & 7);
3035 if (prop_type >= 0) *code++ = prop_type;
3037 *code++ = OP_STAR + repeat_type;
3040 /* Else insert an UPTO if the max is greater than the min, again
3041 preceded by the character, for the previously inserted code. */
3043 else if (repeat_max != repeat_min)
3046 if (utf8 && c >= 128)
3048 memcpy(code, utf8_char, c & 7);
3054 if (prop_type >= 0) *code++ = prop_type;
3055 repeat_max -= repeat_min;
3056 *code++ = OP_UPTO + repeat_type;
3057 PUT2INC(code, 0, repeat_max);
3061 /* The character or character type itself comes last in all cases. */
3064 if (utf8 && c >= 128)
3066 memcpy(code, utf8_char, c & 7);
3073 /* For a repeated Unicode property match, there is an extra byte that
3074 defines the required property. */
3077 if (prop_type >= 0) *code++ = prop_type;
3081 /* If previous was a character class or a back reference, we put the repeat
3082 stuff after it, but just skip the item if the repeat was {0,0}. */
3084 else if (*previous == OP_CLASS ||
3085 *previous == OP_NCLASS ||
3087 *previous == OP_XCLASS ||
3089 *previous == OP_REF)
3091 if (repeat_max == 0)
3097 /* All real repeats make it impossible to handle partial matching (maybe
3098 one day we will be able to remove this restriction). */
3100 if (repeat_max != 1) cd->nopartial = TRUE;
3102 if (repeat_min == 0 && repeat_max == -1)
3103 *code++ = OP_CRSTAR + repeat_type;
3104 else if (repeat_min == 1 && repeat_max == -1)
3105 *code++ = OP_CRPLUS + repeat_type;
3106 else if (repeat_min == 0 && repeat_max == 1)
3107 *code++ = OP_CRQUERY + repeat_type;
3110 *code++ = OP_CRRANGE + repeat_type;
3111 PUT2INC(code, 0, repeat_min);
3112 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
3113 PUT2INC(code, 0, repeat_max);
3117 /* If previous was a bracket group, we may have to replicate it in certain
3120 else if (*previous >= OP_BRA || *previous == OP_ONCE ||
3121 *previous == OP_COND)
3125 int len = code - previous;
3126 uschar *bralink = NULL;
3128 /* If the maximum repeat count is unlimited, find the end of the bracket
3129 by scanning through from the start, and compute the offset back to it
3130 from the current code pointer. There may be an OP_OPT setting following
3131 the final KET, so we can't find the end just by going back from the code
3134 if (repeat_max == -1)
3136 register uschar *ket = previous;
3137 do ket += GET(ket, 1); while (*ket != OP_KET);
3138 ketoffset = code - ket;
3141 /* The case of a zero minimum is special because of the need to stick
3142 OP_BRAZERO in front of it, and because the group appears once in the
3143 data, whereas in other cases it appears the minimum number of times. For
3144 this reason, it is simplest to treat this case separately, as otherwise
3145 the code gets far too messy. There are several special subcases when the
3148 if (repeat_min == 0)
3150 /* If the maximum is also zero, we just omit the group from the output
3153 if (repeat_max == 0)
3159 /* If the maximum is 1 or unlimited, we just have to stick in the
3160 BRAZERO and do no more at this point. However, we do need to adjust
3161 any OP_RECURSE calls inside the group that refer to the group itself or
3162 any internal group, because the offset is from the start of the whole
3163 regex. Temporarily terminate the pattern while doing this. */
3165 if (repeat_max <= 1)
3168 adjust_recurse(previous, 1, utf8, cd);
3169 memmove(previous+1, previous, len);
3171 *previous++ = OP_BRAZERO + repeat_type;
3174 /* If the maximum is greater than 1 and limited, we have to replicate
3175 in a nested fashion, sticking OP_BRAZERO before each set of brackets.
3176 The first one has to be handled carefully because it's the original
3177 copy, which has to be moved up. The remainder can be handled by code
3178 that is common with the non-zero minimum case below. We have to
3179 adjust the value of repeat_max, since one less copy is required. Once
3180 again, we may have to adjust any OP_RECURSE calls inside the group. */
3186 adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd);
3187 memmove(previous + 2 + LINK_SIZE, previous, len);
3188 code += 2 + LINK_SIZE;
3189 *previous++ = OP_BRAZERO + repeat_type;
3190 *previous++ = OP_BRA;
3192 /* We chain together the bracket offset fields that have to be
3193 filled in later when the ends of the brackets are reached. */
3195 offset = (bralink == NULL)? 0 : previous - bralink;
3197 PUTINC(previous, 0, offset);
3203 /* If the minimum is greater than zero, replicate the group as many
3204 times as necessary, and adjust the maximum to the number of subsequent
3205 copies that we need. If we set a first char from the group, and didn't
3206 set a required char, copy the latter from the former. */
3212 if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
3213 for (i = 1; i < repeat_min; i++)
3215 memcpy(code, previous, len);
3219 if (repeat_max > 0) repeat_max -= repeat_min;
3222 /* This code is common to both the zero and non-zero minimum cases. If
3223 the maximum is limited, it replicates the group in a nested fashion,
3224 remembering the bracket starts on a stack. In the case of a zero minimum,
3225 the first one was set up above. In all cases the repeat_max now specifies
3226 the number of additional copies needed. */
3228 if (repeat_max >= 0)
3230 for (i = repeat_max - 1; i >= 0; i--)
3232 *code++ = OP_BRAZERO + repeat_type;
3234 /* All but the final copy start a new nesting, maintaining the
3235 chain of brackets outstanding. */
3241 offset = (bralink == NULL)? 0 : code - bralink;
3243 PUTINC(code, 0, offset);
3246 memcpy(code, previous, len);
3250 /* Now chain through the pending brackets, and fill in their length
3251 fields (which are holding the chain links pro tem). */
3253 while (bralink != NULL)
3256 int offset = code - bralink + 1;
3257 uschar *bra = code - offset;
3258 oldlinkoffset = GET(bra, 1);
3259 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
3261 PUTINC(code, 0, offset);
3262 PUT(bra, 1, offset);
3266 /* If the maximum is unlimited, set a repeater in the final copy. We
3267 can't just offset backwards from the current code point, because we
3268 don't know if there's been an options resetting after the ket. The
3269 correct offset was computed above. */
3271 else code[-ketoffset] = OP_KETRMAX + repeat_type;
3274 /* Else there's some kind of shambles */
3282 /* If the character following a repeat is '+', we wrap the entire repeated
3283 item inside OP_ONCE brackets. This is just syntactic sugar, taken from
3284 Sun's Java package. The repeated item starts at tempcode, not at previous,
3285 which might be the first part of a string whose (former) last char we
3286 repeated. However, we don't support '+' after a greediness '?'. */
3288 if (possessive_quantifier)
3290 int len = code - tempcode;
3291 memmove(tempcode + 1+LINK_SIZE, tempcode, len);
3292 code += 1 + LINK_SIZE;
3293 len += 1 + LINK_SIZE;
3294 tempcode[0] = OP_ONCE;
3296 PUTINC(code, 0, len);
3297 PUT(tempcode, 1, len);
3300 /* In all cases we no longer have a previous item. We also set the
3301 "follows varying string" flag for subsequently encountered reqbytes if
3302 it isn't already set and we have just passed a varying length item. */
3306 cd->req_varyopt |= reqvary;
3310 /* Start of nested bracket sub-expression, or comment or lookahead or
3311 lookbehind or option setting or condition. First deal with special things
3312 that can come after a bracket; all are introduced by ?, and the appearance
3313 of any of them means that this is not a referencing group. They were
3314 checked for validity in the first pass over the string, so we don't have to
3315 check for syntax errors here. */
3318 newoptions = options;
3321 if (*(++ptr) == '?')
3328 case '#': /* Comment; skip to ket */
3330 while (*ptr != ')') ptr++;
3333 case ':': /* Non-extracting bracket */
3339 bravalue = OP_COND; /* Conditional group */
3341 /* Condition to test for recursion */
3345 code[1+LINK_SIZE] = OP_CREF;
3346 PUT2(code, 2+LINK_SIZE, CREF_RECURSE);
3351 /* Condition to test for a numbered subpattern match. We know that
3352 if a digit follows ( then there will just be digits until ) because
3353 the syntax was checked in the first pass. */
3355 else if ((digitab[ptr[1]] && ctype_digit) != 0)
3357 int condref; /* Don't amalgamate; some compilers */
3358 condref = *(++ptr) - '0'; /* grumble at autoincrement in declaration */
3359 while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';
3366 code[1+LINK_SIZE] = OP_CREF;
3367 PUT2(code, 2+LINK_SIZE, condref);
3370 /* For conditions that are assertions, we just fall through, having
3371 set bravalue above. */
3374 case '=': /* Positive lookahead */
3375 bravalue = OP_ASSERT;
3379 case '!': /* Negative lookahead */
3380 bravalue = OP_ASSERT_NOT;
3384 case '<': /* Lookbehinds */
3387 case '=': /* Positive lookbehind */
3388 bravalue = OP_ASSERTBACK;
3392 case '!': /* Negative lookbehind */
3393 bravalue = OP_ASSERTBACK_NOT;
3399 case '>': /* One-time brackets */
3404 case 'C': /* Callout - may be followed by digits; */
3405 previous_callout = code; /* Save for later completion */
3406 after_manual_callout = 1; /* Skip one item before completing */
3407 *code++ = OP_CALLOUT; /* Already checked that the terminating */
3408 { /* closing parenthesis is present. */
3410 while ((digitab[*(++ptr)] & ctype_digit) != 0)
3411 n = n * 10 + *ptr - '0';
3418 PUT(code, 0, ptr - cd->start_pattern + 1); /* Pattern offset */
3419 PUT(code, LINK_SIZE, 0); /* Default length */
3420 code += 2 * LINK_SIZE;
3425 case 'P': /* Named subpattern handling */
3426 if (*(++ptr) == '<') /* Definition */
3429 uschar *slot = cd->name_table;
3430 const uschar *name; /* Don't amalgamate; some compilers */
3431 name = ++ptr; /* grumble at autoincrement in declaration */
3433 while (*ptr++ != '>');
3434 namelen = ptr - name - 1;
3436 for (i = 0; i < cd->names_found; i++)
3438 int crc = memcmp(name, slot+2, namelen);
3441 if (slot[2+namelen] == 0)
3446 crc = -1; /* Current name is substring */
3450 memmove(slot + cd->name_entry_size, slot,
3451 (cd->names_found - i) * cd->name_entry_size);
3454 slot += cd->name_entry_size;
3457 PUT2(slot, 0, *brackets + 1);
3458 memcpy(slot + 2, name, namelen);
3459 slot[2+namelen] = 0;
3461 goto NUMBERED_GROUP;
3464 if (*ptr == '=' || *ptr == '>') /* Reference or recursion */
3468 const uschar *name = ptr;
3469 uschar *slot = cd->name_table;
3471 while (*ptr != ')') ptr++;
3472 namelen = ptr - name;
3474 for (i = 0; i < cd->names_found; i++)
3476 if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
3477 slot += cd->name_entry_size;
3479 if (i >= cd->names_found)
3485 recno = GET2(slot, 0);
3487 if (type == '>') goto HANDLE_RECURSION; /* A few lines below */
3489 /* Back reference */
3493 PUT2INC(code, 0, recno);
3494 cd->backref_map |= (recno < 32)? (1 << recno) : 1;
3495 if (recno > cd->top_backref) cd->top_backref = recno;
3499 /* Should never happen */
3502 case 'R': /* Pattern recursion */
3503 ptr++; /* Same as (?0) */
3506 /* Recursion or "subroutine" call */
3508 case '0': case '1': case '2': case '3': case '4':
3509 case '5': case '6': case '7': case '8': case '9':
3511 const uschar *called;
3513 while((digitab[*ptr] & ctype_digit) != 0)
3514 recno = recno * 10 + *ptr++ - '0';
3516 /* Come here from code above that handles a named recursion */
3522 /* Find the bracket that is being referenced. Temporarily end the
3523 regex in case it doesn't exist. */
3526 called = (recno == 0)?
3527 cd->start_code : find_bracket(cd->start_code, utf8, recno);
3535 /* If the subpattern is still open, this is a recursive call. We
3536 check to see if this is a left recursion that could loop for ever,
3537 and diagnose that case. */
3539 if (GET(called, 1) == 0 && could_be_empty(called, code, bcptr, utf8))
3545 /* Insert the recursion/subroutine item */
3548 PUT(code, 1, called - cd->start_code);
3549 code += 1 + LINK_SIZE;
3553 /* Character after (? not specially recognized */
3555 default: /* Option setting */
3559 while (*ptr != ')' && *ptr != ':')
3563 case '-': optset = &unset; break;
3565 case 'i': *optset |= PCRE_CASELESS; break;
3566 case 'm': *optset |= PCRE_MULTILINE; break;
3567 case 's': *optset |= PCRE_DOTALL; break;
3568 case 'x': *optset |= PCRE_EXTENDED; break;
3569 case 'U': *optset |= PCRE_UNGREEDY; break;
3570 case 'X': *optset |= PCRE_EXTRA; break;
3574 /* Set up the changed option bits, but don't change anything yet. */
3576 newoptions = (options | set) & (~unset);
3578 /* If the options ended with ')' this is not the start of a nested
3579 group with option changes, so the options change at this level. Compile
3580 code to change the ims options if this setting actually changes any of
3581 them. We also pass the new setting back so that it can be put at the
3582 start of any following branches, and when this group ends (if we are in
3583 a group), a resetting item can be compiled.
3585 Note that if this item is right at the start of the pattern, the
3586 options will have been abstracted and made global, so there will be no
3587 change to compile. */
3591 if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
3594 *code++ = newoptions & PCRE_IMS;
3597 /* Change options at this level, and pass them back for use
3598 in subsequent branches. Reset the greedy defaults and the case
3599 value for firstbyte and reqbyte. */
3601 *optionsptr = options = newoptions;
3602 greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
3603 greedy_non_default = greedy_default ^ 1;
3604 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
3606 previous = NULL; /* This item can't be repeated */
3607 continue; /* It is complete */
3610 /* If the options ended with ':' we are heading into a nested group
3611 with possible change of options. Such groups are non-capturing and are
3612 not assertions of any kind. All we need to do is skip over the ':';
3613 the newoptions value is handled below. */
3620 /* If PCRE_NO_AUTO_CAPTURE is set, all unadorned brackets become
3621 non-capturing and behave like (?:...) brackets */
3623 else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
3628 /* Else we have a referencing group; adjust the opcode. If the bracket
3629 number is greater than EXTRACT_BASIC_MAX, we set the opcode one higher, and
3630 arrange for the true number to follow later, in an OP_BRANUMBER item. */
3635 if (++(*brackets) > EXTRACT_BASIC_MAX)
3637 bravalue = OP_BRA + EXTRACT_BASIC_MAX + 1;
3638 code[1+LINK_SIZE] = OP_BRANUMBER;
3639 PUT2(code, 2+LINK_SIZE, *brackets);
3642 else bravalue = OP_BRA + *brackets;
3645 /* Process nested bracketed re. Assertions may not be repeated, but other
3646 kinds can be. We copy code into a non-register variable in order to be able
3647 to pass its address because some compilers complain otherwise. Pass in a
3648 new setting for the ims options if they have changed. */
3650 previous = (bravalue >= OP_ONCE)? code : NULL;
3653 tempreqvary = cd->req_varyopt; /* Save value before bracket */
3656 newoptions, /* The complete new option state */
3657 options & PCRE_IMS, /* The previous ims option state */
3658 brackets, /* Extracting bracket count */
3659 &tempcode, /* Where to put code (updated) */
3660 &ptr, /* Input pointer (updated) */
3661 errorptr, /* Where to put an error message */
3662 (bravalue == OP_ASSERTBACK ||
3663 bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
3664 skipbytes, /* Skip over OP_COND/OP_BRANUMBER */
3665 &subfirstbyte, /* For possible first char */
3666 &subreqbyte, /* For possible last char */
3667 bcptr, /* Current branch chain */
3668 cd)) /* Tables block */
3671 /* At the end of compiling, code is still pointing to the start of the
3672 group, while tempcode has been updated to point past the end of the group
3673 and any option resetting that may follow it. The pattern pointer (ptr)
3674 is on the bracket. */
3676 /* If this is a conditional bracket, check that there are no more than
3677 two branches in the group. */
3679 else if (bravalue == OP_COND)
3688 while (*tc != OP_KET);
3696 /* If there is just one branch, we must not make use of its firstbyte or
3697 reqbyte, because this is equivalent to an empty second branch. */
3699 if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
3702 /* Handle updating of the required and first characters. Update for normal
3703 brackets of all kinds, and conditions with two branches (see code above).
3704 If the bracket is followed by a quantifier with zero repeat, we have to
3705 back off. Hence the definition of zeroreqbyte and zerofirstbyte outside the
3706 main loop so that they can be accessed for the back off. */
3708 zeroreqbyte = reqbyte;
3709 zerofirstbyte = firstbyte;
3710 groupsetfirstbyte = FALSE;
3712 if (bravalue >= OP_BRA || bravalue == OP_ONCE || bravalue == OP_COND)
3714 /* If we have not yet set a firstbyte in this branch, take it from the
3715 subpattern, remembering that it was set here so that a repeat of more
3716 than one can replicate it as reqbyte if necessary. If the subpattern has
3717 no firstbyte, set "none" for the whole branch. In both cases, a zero
3718 repeat forces firstbyte to "none". */
3720 if (firstbyte == REQ_UNSET)
3722 if (subfirstbyte >= 0)
3724 firstbyte = subfirstbyte;
3725 groupsetfirstbyte = TRUE;
3727 else firstbyte = REQ_NONE;
3728 zerofirstbyte = REQ_NONE;
3731 /* If firstbyte was previously set, convert the subpattern's firstbyte
3732 into reqbyte if there wasn't one, using the vary flag that was in
3733 existence beforehand. */
3735 else if (subfirstbyte >= 0 && subreqbyte < 0)
3736 subreqbyte = subfirstbyte | tempreqvary;
3738 /* If the subpattern set a required byte (or set a first byte that isn't
3739 really the first byte - see above), set it. */
3741 if (subreqbyte >= 0) reqbyte = subreqbyte;
3744 /* For a forward assertion, we take the reqbyte, if set. This can be
3745 helpful if the pattern that follows the assertion doesn't set a different
3746 char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
3747 for an assertion, however because it leads to incorrect effect for patterns
3748 such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
3749 of a firstbyte. This is overcome by a scan at the end if there's no
3750 firstbyte, looking for an asserted first char. */
3752 else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
3754 /* Now update the main code pointer to the end of the group. */
3758 /* Error if hit end of pattern */
3767 /* Check \ for being a real metacharacter; if not, fall through and handle
3768 it as a data character at the start of a string. Escape items are checked
3769 for validity in the pre-compiling pass. */
3773 c = check_escape(&ptr, errorptr, *brackets, options, FALSE);
3775 /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
3776 are arranged to be the negation of the corresponding OP_values. For the
3777 back references, the values are ESC_REF plus the reference number. Only
3778 back references and those types that consume a character may be repeated.
3779 We can test for values between ESC_b and ESC_Z for the latter; this may
3780 have to change if any new ones are ever created. */
3784 if (-c == ESC_Q) /* Handle start of quoted string */
3786 if (ptr[1] == '\\' && ptr[2] == 'E') ptr += 2; /* avoid empty string */
3791 /* For metasequences that actually match a character, we disable the
3792 setting of a first character if it hasn't already been set. */
3794 if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
3795 firstbyte = REQ_NONE;
3797 /* Set values to reset to if this is followed by a zero repeat. */
3799 zerofirstbyte = firstbyte;
3800 zeroreqbyte = reqbyte;
3802 /* Back references are handled specially */
3806 int number = -c - ESC_REF;
3809 PUT2INC(code, 0, number);
3812 /* So are Unicode property matches, if supported. We know that get_ucp
3813 won't fail because it was tested in the pre-pass. */
3816 else if (-c == ESC_P || -c == ESC_p)
3819 int value = get_ucp(&ptr, &negated, errorptr);
3821 *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
3826 /* For the rest, we can obtain the OP value by negating the escape
3831 previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
3837 /* We have a data character whose value is in c. In UTF-8 mode it may have
3838 a value > 127. We set its representation in the length/buffer, and then
3839 handle it as a data character. */
3842 if (utf8 && c > 127)
3843 mclength = ord2utf8(c, mcbuffer);
3854 /* Handle a literal character. It is guaranteed not to be whitespace or #
3855 when the extended flag is set. If we are in UTF-8 mode, it may be a
3856 multi-byte literal character. */
3864 if (utf8 && (c & 0xc0) == 0xc0)
3866 while ((ptr[1] & 0xc0) == 0x80)
3867 mcbuffer[mclength++] = *(++ptr);
3871 /* At this point we have the character's bytes in mcbuffer, and the length
3872 in mclength. When not in UTF-8 mode, the length is always 1. */
3876 *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARNC : OP_CHAR;
3877 for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
3879 /* Set the first and required bytes appropriately. If no previous first
3880 byte, set it from this character, but revert to none on a zero repeat.
3881 Otherwise, leave the firstbyte value alone, and don't change it on a zero
3884 if (firstbyte == REQ_UNSET)
3886 zerofirstbyte = REQ_NONE;
3887 zeroreqbyte = reqbyte;
3889 /* If the character is more than one byte long, we can set firstbyte
3890 only if it is not to be matched caselessly. */
3892 if (mclength == 1 || req_caseopt == 0)
3894 firstbyte = mcbuffer[0] | req_caseopt;
3895 if (mclength != 1) reqbyte = code[-1] | cd->req_varyopt;
3897 else firstbyte = reqbyte = REQ_NONE;
3900 /* firstbyte was previously set; we can set reqbyte only if the length is
3901 1 or the matching is caseful. */
3905 zerofirstbyte = firstbyte;
3906 zeroreqbyte = reqbyte;
3907 if (mclength == 1 || req_caseopt == 0)
3908 reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
3911 break; /* End of literal character handling */
3913 } /* end of big loop */
3915 /* Control never reaches here by falling through, only by a goto for all the
3916 error states. Pass back the position in the pattern so that it can be displayed
3917 to the user for diagnosing the error. */
3927 /*************************************************
3928 * Compile sequence of alternatives *
3929 *************************************************/
3931 /* On entry, ptr is pointing past the bracket character, but on return
3932 it points to the closing bracket, or vertical bar, or end of string.
3933 The code variable is pointing at the byte into which the BRA operator has been
3934 stored. If the ims options are changed at the start (for a (?ims: group) or
3935 during any branch, we need to insert an OP_OPT item at the start of every
3936 following branch to ensure they get set correctly at run time, and also pass
3937 the new options into every subsequent branch compile.
3940 options option bits, including any changes for this subpattern
3941 oldims previous settings of ims option bits
3942 brackets -> int containing the number of extracting brackets used
3943 codeptr -> the address of the current code pointer
3944 ptrptr -> the address of the current pattern pointer
3945 errorptr -> pointer to error message
3946 lookbehind TRUE if this is a lookbehind assertion
3947 skipbytes skip this many bytes at start (for OP_COND, OP_BRANUMBER)
3948 firstbyteptr place to put the first required character, or a negative number
3949 reqbyteptr place to put the last required character, or a negative number
3950 bcptr pointer to the chain of currently open branches
3951 cd points to the data block with tables pointers etc.
3953 Returns: TRUE on success
/* compile_regex: compiles one bracketed group. Each alternative is handed to
compile_branch(); the first/required byte results of the branches are merged
into values for the whole group, which are passed back through firstbyteptr
and reqbyteptr. */
3957 compile_regex(int options, int oldims, int *brackets, uschar **codeptr,
3958 const uschar **ptrptr, const char **errorptr, BOOL lookbehind, int skipbytes,
3959 int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd)
3961 const uschar *ptr = *ptrptr;
3962 uschar *code = *codeptr;
/* last_branch follows the start of the most recent branch; start_bracket keeps
the start of the group so the closing ket can record the total length. */
3963 uschar *last_branch = code;
3964 uschar *start_bracket = code;
/* reverse_count remembers where a lookbehind's length operand lives so that it
can be filled in (see the PUT below) once the branch has been compiled. */
3965 uschar *reverse_count = NULL;
3966 int firstbyte, reqbyte;
3967 int branchfirstbyte, branchreqbyte;
/* Nothing is known about first/required bytes until a branch is compiled. */
3973 firstbyte = reqbyte = REQ_UNSET;
3975 /* Offset is set zero to mark that this bracket is still open */
/* Step over the opcode byte, its link field, and any extra bytes the caller
asked to be skipped at the start of the group. */
3978 code += 1 + LINK_SIZE + skipbytes;
3980 /* Loop for each alternative branch */
3984 /* Handle a change of ims options at the start of the branch */
3986 if ((options & PCRE_IMS) != oldims)
/* The current ims bits are written into the code stream. */
3989 *code++ = options & PCRE_IMS;
3992 /* Set up dummy OP_REVERSE if lookbehind assertion */
3996 *code++ = OP_REVERSE;
/* Note the operand position; the real length is stored after the branch. */
3997 reverse_count = code;
4001 /* Now compile the branch */
/* options is passed by address, so changes made inside the branch are seen
here afterwards. */
4003 if (!compile_branch(&options, brackets, &code, &ptr, errorptr,
4004 &branchfirstbyte, &branchreqbyte, &bc, cd))
4010 /* If this is the first branch, the firstbyte and reqbyte values for the
4011 branch become the values for the regex. */
/* last_branch still points at the group's opening item for the first branch;
for later branches it points at the "or" node inserted at the bottom of the
loop. */
4013 if (*last_branch != OP_ALT)
4015 firstbyte = branchfirstbyte;
4016 reqbyte = branchreqbyte;
4019 /* If this is not the first branch, the first char and reqbyte have to
4020 match the values from all the previous branches, except that if the previous
4021 value for reqbyte didn't have REQ_VARY set, it can still match, and we set
4022 REQ_VARY for the regex. */
4026 /* If we previously had a firstbyte, but it doesn't match the new branch,
4027 we have to abandon the firstbyte for the regex, but if there was previously
4028 no reqbyte, it takes on the value of the old firstbyte. */
4030 if (firstbyte >= 0 && firstbyte != branchfirstbyte)
4032 if (reqbyte < 0) reqbyte = firstbyte;
4033 firstbyte = REQ_NONE;
4036 /* If we (now or from before) have no firstbyte, a firstbyte from the
4037 branch becomes a reqbyte if there isn't a branch reqbyte. */
4039 if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
4040 branchreqbyte = branchfirstbyte;
4042 /* Now ensure that the reqbytes match */
/* The comparison masks out REQ_VARY on both sides, so the flag alone never
causes a mismatch. */
4044 if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
4046 else reqbyte |= branchreqbyte; /* To "or" REQ_VARY */
4049 /* If lookbehind, check that this branch matches a fixed-length string,
4050 and put the length into the OP_REVERSE item. Temporarily mark the end of
4051 the branch with OP_END. */
4057 length = find_fixedlength(last_branch, options);
4058 DPRINTF(("fixed length = %d\n", length));
/* A result of -2 is reported as ERR36; any other failing result as ERR25. */
4061 *errorptr = (length == -2)? ERR36 : ERR25;
4065 PUT(reverse_count, 0, length);
4068 /* Reached end of expression, either ')' or end of pattern. Go back through
4069 the alternative branches and reverse the chain of offsets, with the field in
4070 the BRA item now becoming an offset to the first alternative. If there are
4071 no alternatives, it points to the end of the group. The length in the
4072 terminating ket is always the length of the whole bracketed item. If any of
4073 the ims options were changed inside the group, compile a resetting op-code
4074 following, except at the very end of the pattern. Return leaving the pointer
4075 at the terminating char. */
/* Start with the distance from the final branch to the current code pointer. */
4079 int length = code - last_branch;
/* For each branch visited: save the backward link it held, store the forward
length in its place, then step back by the saved amount to the previous
branch. */
4082 int prev_length = GET(last_branch, 1);
4083 PUT(last_branch, 1, length);
4084 length = prev_length;
4085 last_branch -= length;
4089 /* Fill in the ket */
4092 PUT(code, 1, code - start_bracket);
4093 code += 1 + LINK_SIZE;
4095 /* Resetting option if needed */
/* Only done when a ')' follows, i.e. not at the very end of the pattern. */
4097 if ((options & PCRE_IMS) != oldims && *ptr == ')')
4103 /* Set values to pass back */
4107 *firstbyteptr = firstbyte;
4108 *reqbyteptr = reqbyte;
4112 /* Another branch follows; insert an "or" node. Its length field points back
4113 to the previous branch while the bracket remains open. At the end the chain
4114 is reversed. It's done like this so that the start of the bracket has a
4115 zero offset until it is closed, making it possible to detect recursion. */
4118 PUT(code, 1, code - last_branch);
/* The new node becomes the current branch, both locally and in bc. */
4119 bc.current = last_branch = code;
4120 code += 1 + LINK_SIZE;
4123 /* Control never reaches here */
4129 /*************************************************
4130 * Check for anchored expression *
4131 *************************************************/
4133 /* Try to find out if this is an anchored regular expression. Consider each
4134 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
4135 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
4136 it's anchored. However, if this is a multiline pattern, then only OP_SOD
4137 counts, since OP_CIRC can match in the middle.
4139 We can also consider a regex to be anchored if OP_SOM starts all its branches.
4140 This is the code for \G, which means "match at start of match position, taking
4141 into account the match offset".
4143 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
4144 because that will try the rest of the pattern at all possible matching points,
4145 so there is no point trying again.... er ....
4147 .... except when the .* appears inside capturing parentheses, and there is a
4148 subsequent back reference to those parentheses. We haven't enough information
4149 to catch that case precisely.
4151 At first, the best we could do was to detect when .* was in capturing brackets
4152 and the highest back reference was greater than or equal to that level.
4153 However, by keeping a bitmap of the first 31 back references, we can catch some
4154 of the more common cases more precisely.
4157 code points to start of expression (the bracket)
4158 options points to the options setting
4159 bracket_map a bitmap of which brackets we are inside while testing; this
4160 handles up to substring 31; after that we just have to take
4161 the less precise approach
4162 backref_map the back reference bitmap
4164 Returns: TRUE or FALSE
/* is_anchored: examines the first significant item of each alternative of the
group at code, recursing into nested groups, and returns FALSE as soon as one
alternative is found that is not anchored. */
4168 is_anchored(register const uschar *code, int *options, unsigned int bracket_map,
4169 unsigned int backref_map)
/* scode is the first significant item after this alternative's opcode and link
field, as located by first_significant_code(). */
4172 const uschar *scode =
4173 first_significant_code(code + 1+LINK_SIZE, options, PCRE_MULTILINE, FALSE);
4174 register int op = *scode;
4176 /* Capturing brackets */
/* Values above EXTRACT_BASIC_MAX are replaced by a number read from the
compiled code - presumably the full bracket number; confirm against the
encoding used for high-numbered brackets. */
4182 if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE);
/* Numbers of 32 and above cannot have their own bit and all share bit 0.
NOTE(review): for op == 31, (1 << op) shifts into the sign bit of a signed int;
(1u << op) would avoid relying on that - confirm before changing. */
4183 new_map = bracket_map | ((op < 32)? (1 << op) : 1);
4184 if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
4187 /* Other brackets */
/* These groups do not add a bit, so bracket_map is passed on unchanged. */
4189 else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
4191 if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
4194 /* .* is not anchored unless DOTALL is set and it isn't in brackets that
4195 are or may be referenced. */
4197 else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR) &&
4198 (*options & PCRE_DOTALL) != 0)
/* The repeated item must be OP_ANY, and none of the enclosing brackets recorded
in bracket_map may also appear in backref_map. */
4200 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
4203 /* Check for explicit anchoring */
/* This test is true for anything other than OP_SOD, OP_SOM, or (when
PCRE_MULTILINE is not set) OP_CIRC. */
4205 else if (op != OP_SOD && op != OP_SOM &&
4206 ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
/* Follow the link field to the next alternative (or the end of the group). */
4208 code += GET(code, 1);
4210 while (*code == OP_ALT); /* Loop for each alternative */
4216 /*************************************************
4217 * Check for starting with ^ or .* *
4218 *************************************************/
4220 /* This is called to find out if every branch starts with ^ or .* so that
4221 "first char" processing can be done to speed things up in multiline
4222 matching and for non-DOTALL patterns that start with .* (which must start at
4223 the beginning or after \n). As in the case of is_anchored() (see above), we
4224 have to take account of back references to capturing brackets that contain .*
4225 because in that case we can't make the assumption.
4228 code points to start of expression (the bracket)
4229 bracket_map a bitmap of which brackets we are inside while testing; this
4230 handles up to substring 31; after that we just have to take
4231 the less precise approach
4232 backref_map the back reference bitmap
4234 Returns: TRUE or FALSE
/* is_startline: examines the first significant item of each alternative of the
group at code, recursing into nested groups, and returns FALSE as soon as one
alternative is found that starts with neither a circumflex nor a suitable .* */
4238 is_startline(const uschar *code, unsigned int bracket_map,
4239 unsigned int backref_map)
/* scode is the first significant item after this alternative's opcode and link
field; no options pointer is supplied here (NULL, 0). */
4242 const uschar *scode = first_significant_code(code + 1+LINK_SIZE, NULL, 0,
4244 register int op = *scode;
4246 /* Capturing brackets */
/* Values above EXTRACT_BASIC_MAX are replaced by a number read from the
compiled code - presumably the full bracket number; confirm against the
encoding used for high-numbered brackets. */
4252 if (op > EXTRACT_BASIC_MAX) op = GET2(scode, 2+LINK_SIZE);
/* Numbers of 32 and above cannot have their own bit and all share bit 0.
NOTE(review): for op == 31, (1 << op) shifts into the sign bit of a signed int;
(1u << op) would avoid relying on that - confirm before changing. */
4253 new_map = bracket_map | ((op < 32)? (1 << op) : 1);
4254 if (!is_startline(scode, new_map, backref_map)) return FALSE;
4257 /* Other brackets */
/* These groups do not add a bit, so bracket_map is passed on unchanged. */
4259 else if (op == OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
4260 { if (!is_startline(scode, bracket_map, backref_map)) return FALSE; }
4262 /* .* means "start at start or after \n" if it isn't in brackets that
4263 may be referenced. */
4265 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
/* The repeated item must be OP_ANY, and none of the enclosing brackets recorded
in bracket_map may also appear in backref_map. */
4267 if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
4270 /* Check for explicit circumflex */
4272 else if (op != OP_CIRC) return FALSE;
4274 /* Move on to the next alternative */
4276 code += GET(code, 1);
4278 while (*code == OP_ALT); /* Loop for each alternative */
4284 /*************************************************
4285 * Check for asserted fixed first char *
4286 *************************************************/
4288 /* During compilation, the "first char" settings from forward assertions are
4289 discarded, because they can cause conflicts with actual literals that follow.
4290 However, if we end up without a first char setting for an unanchored pattern,
4291 it is worth scanning the regex to see if there is an initial asserted first
4292 char. If all branches start with the same asserted char, or with a bracket all
4293 of whose alternatives start with the same asserted char (recurse ad lib), then
4294 we return that char, otherwise -1.
4297 code points to start of expression (the bracket)
4298 options pointer to the options (used to check casing changes)
4299 inassert TRUE if in an assertion
4301 Returns: -1 or the fixed first char
/* find_firstassertedchar: looks at the first significant item of each
alternative of the group at code and returns the single character that every
alternative asserts first, or -1 if there is no such common character. */
4305 find_firstassertedchar(const uschar *code, int *options, BOOL inassert)
/* c holds the character found so far; -1 means none has been seen yet. */
4307 register int c = -1;
4310 const uschar *scode =
4311 first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);
4312 register int op = *scode;
/* Every opcode at or above OP_BRA is folded down to OP_BRA. */
4314 if (op >= OP_BRA) op = OP_BRA;
/* Nested group: recurse, telling the callee whether this group is itself a
forward assertion. */
4325 if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)
/* The first result is adopted; any later result must agree with it. */
4327 if (c < 0) c = d; else if (c != d) return -1;
4330 case OP_EXACT: /* Fall through */
/* A character item only counts when it is inside an assertion. */
4337 if (!inassert) return -1;
/* Record caseless matching alongside the character value. */
4341 if ((*options & PCRE_CASELESS) != 0) c |= REQ_CASELESS;
/* A different character from the one already recorded means no common char. */
4343 else if (c != scode[1]) return -1;
/* Follow the link field to the next alternative (or the end of the group). */
4347 code += GET(code, 1);
4349 while (*code == OP_ALT);
4357 /*************************************************
4358 * Validate a UTF-8 string *
4359 *************************************************/
4361 /* This function is called (optionally) at the start of compile or match, to
4362 validate that a supposed UTF-8 string is actually valid. The early check means
4363 that subsequent code can assume it is dealing with a valid string. The check
4364 can be turned off for maximum performance, but the consequences of supplying
4365 an invalid string are then undefined.
4368 string points to the string
4369 length length of string, or -1 if the string is zero-terminated
4371 Returns: < 0 if the string is a valid UTF-8 string
4372 >= 0 otherwise; the value is the offset of the bad byte
/* valid_utf8: scans the string one character at a time and returns the offset
of the first byte that fails one of the checks below. */
4376 valid_utf8(const uschar *string, int length)
4378 register const uschar *p;
/* Measure the string by scanning to its terminating zero byte - presumably
only when the caller passed a negative length; confirm against the guard. */
4382 for (p = string; *p != 0; p++);
4383 length = p - string;
4386 for (p = string; length-- > 0; p++)
4389 register int c = *p;
/* Bytes below 128 are single-byte characters and need no further checks. */
4390 if (c < 128) continue;
/* A byte of 128 or more that lacks both top bits cannot start a character. */
4391 if ((c & 0xc0) != 0xc0) return p - string;
4392 ab = utf8_table4[c & 0x3f]; /* Number of additional bytes */
/* Not enough bytes remain in the string for the whole sequence. */
4393 if (length < ab) return p - string;
4396 /* Check top bits in the second byte */
4397 if ((*(++p) & 0xc0) != 0x80) return p - string;
4399 /* Check for overlong sequences for each different length */
/* NOTE(review): lead bytes 0xf8 and 0xfc are examined below, so sequences
longer than four bytes appear to be accepted; RFC 3629 limits UTF-8 to four
bytes - confirm whether that is intended. */
4402 /* Check for xx00 000x */
4404 if ((c & 0x3e) == 0) return p - string;
4405 continue; /* We know there aren't any more bytes to check */
4407 /* Check for 1110 0000, xx0x xxxx */
4409 if (c == 0xe0 && (*p & 0x20) == 0) return p - string;
4412 /* Check for 1111 0000, xx00 xxxx */
4414 if (c == 0xf0 && (*p & 0x30) == 0) return p - string;
4417 /* Check for 1111 1000, xx00 0xxx */
4419 if (c == 0xf8 && (*p & 0x38) == 0) return p - string;
4422 /* Check for leading 0xfe or 0xff, and then for 1111 1100, xx00 00xx */
4424 if (c == 0xfe || c == 0xff ||
4425 (c == 0xfc && (*p & 0x3c) == 0)) return p - string;
4429 /* Check for valid bytes after the 2nd, if any; all must start 10 */
4432 if ((*(++p) & 0xc0) != 0x80) return p - string;
4442 /*************************************************
4443 * Compile a Regular Expression *
4444 *************************************************/
4446 /* This function takes a string and returns a pointer to a block of store
4447 holding a compiled version of the expression.
4450 pattern the regular expression
4451 options various option bits
4452 errorptr pointer to pointer to error text
4453 erroroffset ptr offset in pattern where error was detected
4454 tables pointer to character tables or NULL
4456 Returns: pointer to compiled data block, or NULL on error,
4457 with errorptr and erroroffset set
4461 pcre_compile(const char *pattern, int options, const char **errorptr,
4462 int *erroroffset, const unsigned char *tables)
4465 int length = 1 + LINK_SIZE; /* For initial BRA plus length */
4466 int c, firstbyte, reqbyte;
4468 int branch_extra = 0;
4469 int branch_newextra;
4470 int item_count = -1;
4472 int max_name_size = 0;
4473 int lastitemlength = 0;
4478 BOOL inescq = FALSE;
4479 unsigned int brastackptr = 0;
4482 const uschar *codestart;
4484 compile_data compile_block;
4485 int brastack[BRASTACK_SIZE];
4486 uschar bralenstack[BRASTACK_SIZE];
4488 /* We can't pass back an error message if errorptr is NULL; I guess the best we
4489 can do is just return NULL. */
4491 if (errorptr == NULL) return NULL;
4494 /* However, we can give a message for this error */
4496 if (erroroffset == NULL)
4503 /* Can't support UTF8 unless PCRE has been compiled to include the code. */
4506 utf8 = (options & PCRE_UTF8) != 0;
4507 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
4508 (*erroroffset = valid_utf8((uschar *)pattern, -1)) >= 0)
4514 if ((options & PCRE_UTF8) != 0)
4521 if ((options & ~PUBLIC_OPTIONS) != 0)
4527 /* Set up pointers to the individual character tables */
4529 if (tables == NULL) tables = pcre_default_tables;
4530 compile_block.lcc = tables + lcc_offset;
4531 compile_block.fcc = tables + fcc_offset;
4532 compile_block.cbits = tables + cbits_offset;
4533 compile_block.ctypes = tables + ctypes_offset;
4535 /* Maximum back reference and backref bitmap. This is updated for numeric
4536 references during the first pass, but for named references during the actual
4537 compile pass. The bitmap records up to 31 back references to help in deciding
4538 whether (.*) can be treated as anchored or not. */
4540 compile_block.top_backref = 0;
4541 compile_block.backref_map = 0;
4543 /* Reflect pattern for debugging output */
4545 DPRINTF(("------------------------------------------------------------------\n"));
4546 DPRINTF(("%s\n", pattern));
4548 /* The first thing to do is to make a pass over the pattern to compute the
4549 amount of store required to hold the compiled code. This does not have to be
4550 perfect as long as errors are overestimates. At the same time we can detect any
4551 flag settings right at the start, and extract them. Make an attempt to correct
4552 for any counted white space if an "extended" flag setting appears late in the
4553 pattern. We can't be so clever for #-comments. */
4555 ptr = (const uschar *)(pattern - 1);
4556 while ((c = *(++ptr)) != 0)
4563 /* If we are inside a \Q...\E sequence, all chars are literal */
4567 if ((options & PCRE_AUTO_CALLOUT) != 0) length += 2 + 2*LINK_SIZE;
4571 /* Otherwise, first check for ignored whitespace and comments */
4573 if ((options & PCRE_EXTENDED) != 0)
4575 if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
4578 /* The space before the ; is to avoid a warning on a silly compiler
4579 on the Macintosh. */
4580 while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
4586 item_count++; /* Is zero for the first non-comment item */
4588 /* Allow space for auto callout before every item except quantifiers. */
4590 if ((options & PCRE_AUTO_CALLOUT) != 0 &&
4591 c != '*' && c != '+' && c != '?' &&
4592 (c != '{' || !is_counted_repeat(ptr + 1)))
4593 length += 2 + 2*LINK_SIZE;
4597 /* A backslashed item may be an escaped data character or it may be a
4601 c = check_escape(&ptr, errorptr, bracount, options, FALSE);
4602 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
4604 lastitemlength = 1; /* Default length of last item for repeats */
4606 if (c >= 0) /* Data character */
4608 length += 2; /* For a one-byte character */
4611 if (utf8 && c > 127)
4614 for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
4615 if (c <= utf8_table1[i]) break;
4617 lastitemlength += i;
4624 /* If \Q, enter "literal" mode */
4632 /* \X is supported only if Unicode property support is compiled */
4638 goto PCRE_ERROR_RETURN;
4642 /* \P and \p are for Unicode properties, but only when the support has
4643 been compiled. Each item needs 2 bytes. */
4645 else if (-c == ESC_P || -c == ESC_p)
4651 if (get_ucp(&ptr, &negated, errorptr) < 0) goto PCRE_ERROR_RETURN;
4655 goto PCRE_ERROR_RETURN;
4659 /* Other escapes need one byte */
4663 /* A back reference needs an additional 2 bytes, plus either one or 5
4664 bytes for a repeat. We also need to keep the value of the highest
4669 int refnum = -c - ESC_REF;
4670 compile_block.backref_map |= (refnum < 32)? (1 << refnum) : 1;
4671 if (refnum > compile_block.top_backref)
4672 compile_block.top_backref = refnum;
4673 length += 2; /* For single back reference */
4674 if (ptr[1] == '{' && is_counted_repeat(ptr+2))
4676 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
4677 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
4678 if ((min == 0 && (max == 1 || max == -1)) ||
4679 (min == 1 && max == -1))
4682 if (ptr[1] == '?') ptr++;
4687 case '^': /* Single-byte metacharacters */
4694 case '*': /* These repeats won't be after brackets; */
4695 case '+': /* those are handled separately */
4698 goto POSESSIVE; /* A few lines below */
4700 /* This covers the cases of braced repeats after a single char, metachar,
4701 class, or back reference. */
4704 if (!is_counted_repeat(ptr+1)) goto NORMAL_CHAR;
4705 ptr = read_repeat_counts(ptr+1, &min, &max, errorptr);
4706 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
4708 /* These special cases just insert one extra opcode */
4710 if ((min == 0 && (max == 1 || max == -1)) ||
4711 (min == 1 && max == -1))
4714 /* These cases might insert additional copies of a preceding character. */
4720 length -= lastitemlength; /* Uncount the original char or metachar */
4721 if (min > 0) length += 3 + lastitemlength;
4723 length += lastitemlength + ((max > 0)? 3 : 1);
4726 if (ptr[1] == '?') ptr++; /* Needs no extra length */
4728 POSESSIVE: /* Test for possessive quantifier */
4732 length += 2 + 2*LINK_SIZE; /* Allow for atomic brackets */
4736 /* An alternation contains an offset to the next branch or ket. If any ims
4737 options changed in the previous branch(es), and/or if we are in a
4738 lookbehind assertion, extra space will be needed at the start of the
4739 branch. This is handled by branch_extra. */
4742 length += 1 + LINK_SIZE + branch_extra;
4745 /* A character class uses 33 characters provided that all the character
4746 values are less than 256. Otherwise, it uses a bit map for low valued
4747 characters, and individual items for others. Don't worry about character
4748 types that aren't allowed in classes - they'll get picked up during the
4749 compile. A character class that contains only one single-byte character
4750 uses 2 or 3 bytes, depending on whether it is negated or not. Notice this
4751 where we can. (In UTF-8 mode we can do this only for chars < 128.) */
4754 if (*(++ptr) == '^')
4756 class_optcount = 10; /* Greater than one */
4759 else class_optcount = 0;
4765 /* Written as a "do" so that an initial ']' is taken as data */
4769 /* Inside \Q...\E everything is literal except \E */
4773 if (*ptr != '\\' || ptr[1] != 'E') goto GET_ONE_CHARACTER;
4779 /* Outside \Q...\E, check for escapes */
4783 c = check_escape(&ptr, errorptr, bracount, options, TRUE);
4784 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
4786 /* \b is backspace inside a class; \X is literal */
4788 if (-c == ESC_b) c = '\b';
4789 else if (-c == ESC_X) c = 'X';
4791 /* \Q enters quoting mode */
4793 else if (-c == ESC_Q)
4799 /* Handle escapes that turn into characters */
4801 if (c >= 0) goto NON_SPECIAL_CHARACTER;
4803 /* Escapes that are meta-things. The normal ones just affect the
4804 bit map, but Unicode properties require an XCLASS extended item. */
4808 class_optcount = 10; /* \d, \s etc; make sure > 1 */
4810 if (-c == ESC_p || -c == ESC_P)
4815 length += LINK_SIZE + 2;
4823 /* Check the syntax for POSIX stuff. The bits we actually handle are
4824 checked during the real compile phase. */
4826 else if (*ptr == '[' && check_posix_syntax(ptr, &ptr, &compile_block))
4829 class_optcount = 10; /* Make sure > 1 */
4832 /* Anything else increments the possible optimization count. We have to
4833 detect ranges here so that we can compute the number of extra ranges for
4834 caseless wide characters when UCP support is available. If there are wide
4835 characters, we are going to have to use an XCLASS, even for single
4848 GETCHARLEN(c, ptr, extra);
4856 /* Come here from handling \ above when it escapes to a char value */
4858 NON_SPECIAL_CHARACTER:
4864 uschar const *hyptr = ptr++;
4868 d = check_escape(&ptr, errorptr, bracount, options, TRUE);
4869 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
4870 if (-d == ESC_b) d = '\b'; /* backspace */
4871 else if (-d == ESC_X) d = 'X'; /* literal X in a class */
4873 else if (ptr[1] != 0 && ptr[1] != ']')
4880 GETCHARLEN(d, ptr, extra);
4887 if (d < 0) ptr = hyptr; /* go back to hyphen as data */
4890 /* If d >= 0 we have a range. In UTF-8 mode, if the end is > 255, or >
4891 127 for caseless matching, we will need to use an XCLASS. */
4895 class_optcount = 10; /* Ensure > 1 */
4899 goto PCRE_ERROR_RETURN;
4903 if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
4906 if (!class_utf8) /* Allow for XCLASS overhead */
4909 length += LINK_SIZE + 2;
4913 /* If we have UCP support, find out how many extra ranges are
4914 needed to map the other case of characters within this range. We
4915 have to mimic the range optimization here, because extending the
4916 range upwards might push d over a boundary that makes it use
4917 another byte in the UTF-8 representation. */
4919 if ((options & PCRE_CASELESS) != 0)
4924 while (get_othercase_range(&cc, origd, &occ, &ocd))
4926 if (occ >= c && ocd <= d) continue; /* Skip embedded */
4928 if (occ < c && ocd >= c - 1) /* Extend the basic range */
4929 { /* if there is overlap, */
4930 c = occ; /* noting that if occ < c */
4931 continue; /* we can't have ocd > d */
4932 } /* because a subrange is */
4933 if (ocd > d && occ <= d + 1) /* always shorter than */
4934 { /* the basic range. */
4939 /* An extra item is needed */
4941 length += 1 + ord2utf8(occ, buffer) +
4942 ((occ == ocd)? 0 : ord2utf8(ocd, buffer));
4945 #endif /* SUPPORT_UCP */
4947 /* The length of the (possibly extended) range */
4949 length += 1 + ord2utf8(c, buffer) + ord2utf8(d, buffer);
4951 #endif /* SUPPORT_UTF8 */
4955 /* We have a single character. There is nothing to be done unless we
4956 are in UTF-8 mode. If the char is > 255, or 127 when caseless, we must
4957 allow for an XCL_SINGLE item, doubled for caselessness if there is UCP
4963 if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
4966 class_optcount = 10; /* Ensure > 1 */
4967 if (!class_utf8) /* Allow for XCLASS overhead */
4970 length += LINK_SIZE + 2;
4973 length += (((options & PCRE_CASELESS) != 0)? 2 : 1) *
4974 (1 + ord2utf8(c, buffer));
4975 #else /* SUPPORT_UCP */
4976 length += 1 + ord2utf8(c, buffer);
4977 #endif /* SUPPORT_UCP */
4979 #endif /* SUPPORT_UTF8 */
4983 while (*(++ptr) != 0 && (inescq || *ptr != ']')); /* Concludes "do" above */
4985 if (*ptr == 0) /* Missing terminating ']' */
4988 goto PCRE_ERROR_RETURN;
4991 /* We can optimize when there was only one optimizable character. Repeats
4992 for positive and negated single one-byte chars are handled by the general
4993 code. Here, we handle repeats for the class opcodes. */
4995 if (class_optcount == 1) length += 3; else
4999 /* A repeat needs either 1 or 5 bytes. If it is a possessive quantifier,
5000 we also need extra for wrapping the whole thing in a sub-pattern. */
5002 if (*ptr != 0 && ptr[1] == '{' && is_counted_repeat(ptr+2))
5004 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
5005 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
5006 if ((min == 0 && (max == 1 || max == -1)) ||
5007 (min == 1 && max == -1))
5013 length += 2 + 2*LINK_SIZE;
5015 else if (ptr[1] == '?') ptr++;
5020 /* Brackets may be genuine groups or special things */
5023 branch_newextra = 0;
5024 bracket_length = 1 + LINK_SIZE;
5026 /* Handle special forms of bracket, which all start (? */
5035 /* Skip over comments entirely */
5038 while (*ptr != 0 && *ptr != ')') ptr++;
5042 goto PCRE_ERROR_RETURN;
5046 /* Non-referencing groups and lookaheads just move the pointer on, and
5047 then behave like a non-special bracket, except that they don't increment
5048 the count of extracting brackets. Ditto for the "once only" bracket,
5049 which is in Perl from version 5.005. */
5058 /* (?R) specifies a recursive call to the regex, which is an extension
5059 to provide the facility which can be obtained by (?p{perl-code}) in
5060 Perl 5.6. In Perl 5.8 this has become (??{perl-code}).
5062 From PCRE 4.00, items such as (?3) specify subroutine-like "calls" to
5063 the appropriate numbered brackets. This includes both recursive and
5064 non-recursive calls. (?R) is now synonymous with (?0). */
5069 case '0': case '1': case '2': case '3': case '4':
5070 case '5': case '6': case '7': case '8': case '9':
5073 while ((digitab[*(++ptr)] & ctype_digit) != 0);
5077 goto PCRE_ERROR_RETURN;
5079 length += 1 + LINK_SIZE;
5081 /* If this item is quantified, it will get wrapped inside brackets so
5082 as to use the code for quantified brackets. We jump down and use the
5083 code that handles this for real brackets. */
5085 if (ptr[1] == '+' || ptr[1] == '*' || ptr[1] == '?' || ptr[1] == '{')
5087 length += 2 + 2 * LINK_SIZE; /* to make bracketed */
5088 duplength = 5 + 3 * LINK_SIZE;
5089 goto HANDLE_QUANTIFIED_BRACKETS;
5093 /* (?C) is an extension which provides "callout" - to provide a bit of
5094 the functionality of the Perl (?{...}) feature. An optional number may
5095 follow (default is zero). */
5099 while ((digitab[*(++ptr)] & ctype_digit) != 0);
5103 goto PCRE_ERROR_RETURN;
5105 length += 2 + 2*LINK_SIZE;
5108 /* Named subpatterns are an extension copied from Python */
5114 const uschar *p; /* Don't amalgamate; some compilers */
5115 p = ++ptr; /* grumble at autoincrement in declaration */
5116 while ((compile_block.ctypes[*ptr] & ctype_word) != 0) ptr++;
5120 goto PCRE_ERROR_RETURN;
5123 if (ptr - p > max_name_size) max_name_size = (ptr - p);
5127 if (*ptr == '=' || *ptr == '>')
5129 while ((compile_block.ctypes[*(++ptr)] & ctype_word) != 0);
5133 goto PCRE_ERROR_RETURN;
5138 /* Unknown character after (?P */
5141 goto PCRE_ERROR_RETURN;
5143 /* Lookbehinds are in Perl from version 5.005 */
5147 if (*ptr == '=' || *ptr == '!')
5149 branch_newextra = 1 + LINK_SIZE;
5150 length += 1 + LINK_SIZE; /* For the first branch */
5154 goto PCRE_ERROR_RETURN;
5156 /* Conditionals are in Perl from version 5.005. The bracket must either
5157 be followed by a number (for bracket reference) or by an assertion
5158 group, or (a PCRE extension) by 'R' for a recursion test. */
5161 if (ptr[3] == 'R' && ptr[4] == ')')
5166 else if ((digitab[ptr[3]] & ctype_digit) != 0)
5170 while ((digitab[*ptr] & ctype_digit) != 0) ptr++;
5174 goto PCRE_ERROR_RETURN;
5177 else /* An assertion must follow */
5179 ptr++; /* Can treat like ':' as far as spacing is concerned */
5180 if (ptr[2] != '?' ||
5181 (ptr[3] != '=' && ptr[3] != '!' && ptr[3] != '<') )
5183 ptr += 2; /* To get right offset in message */
5185 goto PCRE_ERROR_RETURN;
5190 /* Else loop checking valid options until ) is met. Anything else is an
5191 error. If we are without any brackets, i.e. at top level, the settings
5192 act as if specified in the options, so massage the options immediately.
5193 This is for backward compatibility with Perl 5.004. */
5206 *optset |= PCRE_CASELESS;
5210 *optset |= PCRE_MULTILINE;
5214 *optset |= PCRE_DOTALL;
5218 *optset |= PCRE_EXTENDED;
5222 *optset |= PCRE_EXTRA;
5226 *optset |= PCRE_UNGREEDY;
5233 /* A termination by ')' indicates an options-setting-only item; if
5234 this is at the very start of the pattern (indicated by item_count
5235 being zero), we use it to set the global options. This is helpful
5236 when analyzing the pattern for first characters, etc. Otherwise
5237 nothing is done here and it is handled during the compiling
5240 [Historical note: Up to Perl 5.8, options settings at top level
5241 were always global settings, wherever they appeared in the pattern.
5242 That is, they were equivalent to an external setting. From 5.8
5243 onwards, they apply only to what follows (which is what you might
5247 if (item_count == 0)
5249 options = (options | set) & (~unset);
5250 set = unset = 0; /* To save length */
5251 item_count--; /* To allow for several */
5256 /* A termination by ':' indicates the start of a nested group with
5257 the given options set. This is again handled at compile time, but
5258 we must allow for compiled space if any of the ims options are
5259 set. We also have to allow for resetting space at the end of
5260 the group, which is why 4 is added to the length and not just 2.
5261 If there are several changes of options within the same group, this
5262 will lead to an over-estimate on the length, but this shouldn't
5263 matter very much. We also have to allow for resetting options at
5264 the start of any alternations, which we do by setting
5265 branch_newextra to 2. Finally, we record whether the case-dependent
5266 flag ever changes within the regex. This is used by the "required
5270 if (((set|unset) & PCRE_IMS) != 0)
5273 branch_newextra = 2;
5274 if (((set|unset) & PCRE_CASELESS) != 0) options |= PCRE_ICHANGED;
5278 /* Unrecognized option character */
5282 goto PCRE_ERROR_RETURN;
5286 /* If we hit a closing bracket, that's it - this is a freestanding
5287 option-setting. We need to ensure that branch_extra is updated if
5288 necessary. The only values branch_newextra can have here are 0 or 2.
5289 If the value is 2, then branch_extra must either be 2 or 5, depending
5290 on whether this is a lookbehind group or not. */
5295 if (branch_newextra == 2 &&
5296 (branch_extra == 0 || branch_extra == 1+LINK_SIZE))
5297 branch_extra += branch_newextra;
5301 /* If options were terminated by ':' control comes here. Fall through
5302 to handle the group below. */
5306 /* Extracting brackets must be counted so we can process escapes in a
5307 Perlish way. If the number exceeds EXTRACT_BASIC_MAX we are going to
5308 need an additional 3 bytes of store per extracting bracket. However, if
5309 PCRE_NO_AUTO_CAPTURE is set, unadorned brackets become non-capturing, so we
5310 must leave the count alone (it will always be zero). */
5312 else if ((options & PCRE_NO_AUTO_CAPTURE) == 0)
5315 if (bracount > EXTRACT_BASIC_MAX) bracket_length += 3;
5318 /* Save length for computing whole length at end if there's a repeat that
5319 requires duplication of the group. Also save the current value of
5320 branch_extra, and start the new group with the new value. If non-zero, this
5321 will either be 2 for a (?imsx: group, or 3 for a lookbehind assertion. */
5323 if (brastackptr >= sizeof(brastack)/sizeof(int))
5326 goto PCRE_ERROR_RETURN;
5329 bralenstack[brastackptr] = branch_extra;
5330 branch_extra = branch_newextra;
5332 brastack[brastackptr++] = length;
5333 length += bracket_length;
5336 /* Handle ket. Look for subsequent max/min; for certain sets of values we
5337 have to replicate this bracket up to that many times. If brastackptr is
5338 0 this is an unmatched bracket which will generate an error, but take care
5339 not to try to access brastack[-1] when computing the length and restoring
5340 the branch_extra value. */
5343 length += 1 + LINK_SIZE;
5344 if (brastackptr > 0)
5346 duplength = length - brastack[--brastackptr];
5347 branch_extra = bralenstack[brastackptr];
5351 /* The following code is also used when a recursion such as (?3) is
5352 followed by a quantifier, because in that case, it has to be wrapped inside
5353 brackets so that the quantifier works. The value of duplength must be
5354 set before arrival. */
5356 HANDLE_QUANTIFIED_BRACKETS:
5358 /* Leave ptr at the final char; for read_repeat_counts this happens
5359 automatically; for the others we need an increment. */
5361 if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2))
5363 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr);
5364 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
5366 else if (c == '*') { min = 0; max = -1; ptr++; }
5367 else if (c == '+') { min = 1; max = -1; ptr++; }
5368 else if (c == '?') { min = 0; max = 1; ptr++; }
5369 else { min = 1; max = 1; }
5371 /* If the minimum is zero, we have to allow for an OP_BRAZERO before the
5372 group, and if the maximum is greater than zero, we have to replicate
5373 maxval-1 times; each replication acquires an OP_BRAZERO plus a nesting
5379 if (max > 0) length += (max - 1) * (duplength + 3 + 2*LINK_SIZE);
5382 /* When the minimum is greater than zero, we have to replicate up to
5383 minval-1 times, with no additions required in the copies. Then, if there
5384 is a limited maximum we have to replicate up to maxval-1 times allowing
5385 for a BRAZERO item before each optional copy and nesting brackets for all
5386 but one of the optional copies. */
5390 length += (min - 1) * duplength;
5391 if (max > min) /* Need this test as max=-1 means no limit */
5392 length += (max - min) * (duplength + 3 + 2*LINK_SIZE)
5393 - (2 + 2*LINK_SIZE);
5396 /* Allow space for once brackets for "possessive quantifier" */
5401 length += 2 + 2*LINK_SIZE;
5405 /* Non-special character. It won't be space or # in extended mode, so it is
5406 always a genuine character. If we are in a \Q...\E sequence, check for the
5407 end; if not, we have a literal. */
5412 if (inescq && c == '\\' && ptr[1] == 'E')
5419 length += 2; /* For a one-byte character */
5420 lastitemlength = 1; /* Default length of last item for repeats */
5422 /* In UTF-8 mode, check for additional bytes. */
5425 if (utf8 && (c & 0xc0) == 0xc0)
5427 while ((ptr[1] & 0xc0) == 0x80) /* Can't flow over the end */
5428 { /* because the end is marked */
5429 lastitemlength++; /* by a zero byte. */
5440 length += 2 + LINK_SIZE; /* For final KET and END */
5442 if ((options & PCRE_AUTO_CALLOUT) != 0)
5443 length += 2 + 2*LINK_SIZE; /* For final callout */
5445 if (length > MAX_PATTERN_SIZE)
5451 /* Compute the size of data block needed and get it, either from malloc or
5452 externally provided function. */
5454 size = length + sizeof(real_pcre) + name_count * (max_name_size + 3);
5455 re = (real_pcre *)(pcre_malloc)(size);
5463 /* Put in the magic number, and save the sizes, options, and character table
5464 pointer. NULL is used for the default character tables. The nullpad field is at
5465 the end; it's there to help in the case when a regex compiled on a system with
5466 4-byte pointers is run on another with 8-byte pointers. */
5468 re->magic_number = MAGIC_NUMBER;
5470 re->options = options;
5471 re->dummy1 = re->dummy2 = 0;
5472 re->name_table_offset = sizeof(real_pcre);
5473 re->name_entry_size = max_name_size + 3;
5474 re->name_count = name_count;
5475 re->tables = (tables == pcre_default_tables)? NULL : tables;
5478 /* The starting points of the name/number translation table and of the code are
5479 passed around in the compile data block. */
5481 compile_block.names_found = 0;
5482 compile_block.name_entry_size = max_name_size + 3;
5483 compile_block.name_table = (uschar *)re + re->name_table_offset;
5484 codestart = compile_block.name_table + re->name_entry_size * re->name_count;
5485 compile_block.start_code = codestart;
5486 compile_block.start_pattern = (const uschar *)pattern;
5487 compile_block.req_varyopt = 0;
5488 compile_block.nopartial = FALSE;
5490 /* Set up a starting, non-extracting bracket, then compile the expression. On
5491 error, *errorptr will be set non-NULL, so we don't need to look at the result
5492 of the function here. */
5494 ptr = (const uschar *)pattern;
5495 code = (uschar *)codestart;
5498 (void)compile_regex(options, options & PCRE_IMS, &bracount, &code, &ptr,
5499 errorptr, FALSE, 0, &firstbyte, &reqbyte, NULL, &compile_block);
5500 re->top_bracket = bracount;
5501 re->top_backref = compile_block.top_backref;
5503 if (compile_block.nopartial) re->options |= PCRE_NOPARTIAL;
5505 /* If not reached end of pattern on success, there's an excess bracket. */
5507 if (*errorptr == NULL && *ptr != 0) *errorptr = ERR22;
5509 /* Fill in the terminating state and check for disastrous overflow, but
5510 if debugging, leave the test till after things are printed out. */
5515 if (code - codestart > length) *errorptr = ERR23;
5518 /* Give an error if there's a back reference to a non-existent capturing
5521 if (re->top_backref > re->top_bracket) *errorptr = ERR15;
5523 /* Failed to compile, or error while post-processing */
5525 if (*errorptr != NULL)
5529 *erroroffset = ptr - (const uschar *)pattern;
5533 /* If the anchored option was not passed, set the flag if we can determine that
5534 the pattern is anchored by virtue of ^ characters or \A or anything else (such
5535 as starting with .* when DOTALL is set).
5537 Otherwise, if we know what the first character has to be, save it, because that
5538 speeds up unanchored matches no end. If not, see if we can set the
5539 PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
5540 start with ^, and also when all branches start with .* for non-DOTALL matches.
5543 if ((options & PCRE_ANCHORED) == 0)
5545 int temp_options = options;
5546 if (is_anchored(codestart, &temp_options, 0, compile_block.backref_map))
5547 re->options |= PCRE_ANCHORED;
5551 firstbyte = find_firstassertedchar(codestart, &temp_options, FALSE);
5552 if (firstbyte >= 0) /* Remove caseless flag for non-caseable chars */
5554 int ch = firstbyte & 255;
5555 re->first_byte = ((firstbyte & REQ_CASELESS) != 0 &&
5556 compile_block.fcc[ch] == ch)? ch : firstbyte;
5557 re->options |= PCRE_FIRSTSET;
5559 else if (is_startline(codestart, 0, compile_block.backref_map))
5560 re->options |= PCRE_STARTLINE;
5564 /* For an anchored pattern, we use the "required byte" only if it follows a
5565 variable length item in the regex. Remove the caseless flag for non-caseable
5569 ((re->options & PCRE_ANCHORED) == 0 || (reqbyte & REQ_VARY) != 0))
5571 int ch = reqbyte & 255;
5572 re->req_byte = ((reqbyte & REQ_CASELESS) != 0 &&
5573 compile_block.fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte;
5574 re->options |= PCRE_REQCHSET;
5577 /* Print out the compiled data for debugging */
5581 printf("Length = %d top_bracket = %d top_backref = %d\n",
5582 length, re->top_bracket, re->top_backref);
5584 if (re->options != 0)
5586 printf("%s%s%s%s%s%s%s%s%s%s\n",
5587 ((re->options & PCRE_NOPARTIAL) != 0)? "nopartial " : "",
5588 ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
5589 ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
5590 ((re->options & PCRE_ICHANGED) != 0)? "case state changed " : "",
5591 ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
5592 ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
5593 ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
5594 ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
5595 ((re->options & PCRE_EXTRA) != 0)? "extra " : "",
5596 ((re->options & PCRE_UNGREEDY) != 0)? "ungreedy " : "");
5599 if ((re->options & PCRE_FIRSTSET) != 0)
5601 int ch = re->first_byte & 255;
5602 const char *caseless = ((re->first_byte & REQ_CASELESS) == 0)? "" : " (caseless)";
5603 if (isprint(ch)) printf("First char = %c%s\n", ch, caseless);
5604 else printf("First char = \\x%02x%s\n", ch, caseless);
5607 if ((re->options & PCRE_REQCHSET) != 0)
5609 int ch = re->req_byte & 255;
5610 const char *caseless = ((re->req_byte & REQ_CASELESS) == 0)? "" : " (caseless)";
5611 if (isprint(ch)) printf("Req char = %c%s\n", ch, caseless);
5612 else printf("Req char = \\x%02x%s\n", ch, caseless);
5615 print_internals(re, stdout);
5617 /* This check is done here in the debugging case so that the code that
5618 was compiled can be seen. */
5620 if (code - codestart > length)
5624 *erroroffset = ptr - (uschar *)pattern;
5634 /*************************************************
5635 * Match a back-reference *
5636 *************************************************/
5638 /* If a back reference hasn't been set, the length that is passed is greater
5639 than the number of characters left in the string, so the match fails.
5642 offset index into the offset vector
5643 eptr points into the subject
5644 length length to be matched
5645 md points to match data block
5648 Returns: TRUE if matched
5652 match_ref(int offset, register const uschar *eptr, int length, match_data *md,
5653 unsigned long int ims) /* of ims, only the PCRE_CASELESS bit is examined here */
5655 const uschar *p = md->start_subject + md->offset_vector[offset]; /* text located via the stored offset */
5658 if (eptr >= md->end_subject) /* trace output: "<null>" when eptr is already at the end of the subject */
5659 printf("matching subject <null>");
5662 printf("matching subject ");
5663 pchars(eptr, length, TRUE, md);
5665 printf(" against backref ");
5666 pchars(p, length, FALSE, md);
5670 /* Always fail if not enough characters left */
5672 if (length > md->end_subject - eptr) return FALSE; /* fewer than length bytes remain after eptr */
5674 /* Separate the caseless case for speed */
5676 if ((ims & PCRE_CASELESS) != 0)
5678 while (length-- > 0)
5679 if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE; /* both bytes go through the md->lcc table before comparing */
5682 { while (length-- > 0) if (*p++ != *eptr++) return FALSE; } /* otherwise the bytes must be identical */
5689 /*************************************************
5690 * Match character against an XCLASS *
5691 *************************************************/
5693 /* This function is called from within the XCLASS code below, to match a
5694 character against an extended class which might match values > 255.
5698 data points to the flag byte of the XCLASS data
5700 Returns: TRUE if character matches, else FALSE
5704 match_xclass(int c, const uschar *data)
5707 BOOL negated = (*data & XCL_NOT) != 0; /* XCL_NOT in the flag byte inverts every result returned below */
5709 /* Character values < 256 are matched against a bitmap, if one is present. If
5710 not, we still carry on, because there may be ranges that start below 256 in the
5715 if ((*data & XCL_MAP) != 0 && (data[1 + c/8] & (1 << (c&7))) != 0)
5716 return !negated; /* char found */
5719 /* First skip the bit map if present. Then match against the list of Unicode
5720 properties or large chars or ranges that end with a large char. We won't ever
5721 encounter XCL_PROP or XCL_NOTPROP when UCP support is not compiled. */
5723 if ((*data++ & XCL_MAP) != 0) data += 32; /* step past the flag byte; a map, when present, takes 32 more bytes */
5725 while ((t = *data++) != XCL_END) /* each item starts with a type byte; XCL_END terminates the list */
5728 if (t == XCL_SINGLE)
5730 GETCHARINC(x, data);
5731 if (c == x) return !negated; /* item names exactly this character */
5733 else if (t == XCL_RANGE)
5735 GETCHARINC(x, data);
5736 GETCHARINC(y, data);
5737 if (c >= x && c <= y) return !negated; /* range test is inclusive at both ends */
5741 else /* XCL_PROP & XCL_NOTPROP */
5743 int chartype, othercase;
5744 int rqdtype = *data++; /* the byte after the item type holds the required property value */
5745 int category = ucp_findchar(c, &chartype, &othercase);
5748 if ((rqdtype - 128 == category) == (t == XCL_PROP)) return !negated; /* XCL_PROP hits on equality, XCL_NOTPROP on inequality */
5752 if ((rqdtype == chartype) == (t == XCL_PROP)) return !negated; /* same rule, applied to chartype instead of category */
5755 #endif /* SUPPORT_UCP */
5758 return negated; /* char did not match */
5763 /***************************************************************************
5764 ****************************************************************************
5765 RECURSION IN THE match() FUNCTION
5767 The match() function is highly recursive. Some regular expressions can cause
5768 it to recurse thousands of times. I was writing for Unix, so I just let it
5769 call itself recursively. This uses the stack for saving everything that has
5770 to be saved for a recursive call. On Unix, the stack can be large, and this
5773 It turns out that on non-Unix systems there are problems with programs that
5774 use a lot of stack. (This despite the fact that every last chip has oodles
5775 of memory these days, and techniques for extending the stack have been known
5776 for decades.) So....
5778 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
5779 calls by keeping local variables that need to be preserved in blocks of memory
5780 obtained from malloc instead of on the stack. Macros are used to
5781 achieve this so that the actual code doesn't look very different to what it
5783 ****************************************************************************
5784 ***************************************************************************/
5787 /* These versions of the macros use the stack, as normal */
5790 #define REGISTER register
5791 #define RMATCH(rx,ra,rb,rc,rd,re,rf,rg) rx = match(ra,rb,rc,rd,re,rf,rg)
5792 #define RRETURN(ra) return ra
5796 /* These versions of the macros manage a private stack on the heap. Note
5797 that the rd argument of RMATCH isn't actually used. It's the md argument of
5798 match(), which never changes. */
5802 #define RMATCH(rx,ra,rb,rc,rd,re,rf,rg)\
5804 heapframe *newframe = (pcre_stack_malloc)(sizeof(heapframe));\
5805 if (setjmp(frame->Xwhere) == 0)\
5807 newframe->Xeptr = ra;\
5808 newframe->Xecode = rb;\
5809 newframe->Xoffset_top = rc;\
5810 newframe->Xims = re;\
5811 newframe->Xeptrb = rf;\
5812 newframe->Xflags = rg;\
5813 newframe->Xprevframe = frame;\
5815 DPRINTF(("restarting from line %d\n", __LINE__));\
5820 DPRINTF(("longjumped back to line %d\n", __LINE__));\
5821 frame = md->thisframe;\
5822 rx = frame->Xresult;\
5826 #define RRETURN(ra)\
5828 heapframe *newframe = frame;\
5829 frame = newframe->Xprevframe;\
5830 (pcre_stack_free)(newframe);\
5833 frame->Xresult = ra;\
5834 md->thisframe = frame;\
5835 longjmp(frame->Xwhere, 1);\
5841 /* Structure for remembering the local variables in a private frame */
5843 typedef struct heapframe {
5844 struct heapframe *Xprevframe;
5846 /* Function arguments that may change */
5848 const uschar *Xeptr;
5849 const uschar *Xecode;
5855 /* Function local variables */
5857 const uschar *Xcallpat;
5858 const uschar *Xcharptr;
5859 const uschar *Xdata;
5860 const uschar *Xnext;
5862 const uschar *Xprev;
5863 const uschar *Xsaved_eptr;
5865 recursion_info Xnew_recursive;
5872 unsigned long int Xoriginal_ims;
5876 int Xprop_fail_result;
5879 int Xprop_othercase;
5880 int Xprop_test_against;
5881 int *Xprop_test_variable;
5893 int Xsave_capture_last;
5894 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
5895 int Xstacksave[REC_STACK_SAVE_MAX];
5899 /* Place to pass back result, and where to jump back to */
5909 /***************************************************************************
5910 ***************************************************************************/
5914 /*************************************************
5915 * Match from current position *
5916 *************************************************/
5918 /* On entry ecode points to the first opcode, and eptr to the first character
5919 in the subject string, while eptrb holds the value of eptr at the start of the
5920 last bracketed group - used for breaking infinite loops matching zero-length
5921 strings. This function is called recursively in many circumstances. Whenever it
5922 returns a negative (error) response, the outer incarnation must also return the
5925 Performance note: It might be tempting to extract commonly used fields from the
5926 md structure (e.g. utf8, end_subject) into individual variables to improve
5927 performance. Tests using gcc on a SPARC disproved this; in the first case, it
5928 made performance worse.
5931 eptr pointer in subject
5932 ecode position in code
5933 offset_top current top pointer
5934 md pointer to "static" info for the match
5935 ims current /i, /m, and /s options
5936 eptrb pointer to chain of blocks containing eptr at start of
5937 brackets - for testing for empty matches
5939 match_condassert - this is an assertion condition
5940 match_isgroup - this is the start of a bracketed group
5942 Returns: MATCH_MATCH if matched ) these values are >= 0
5943 MATCH_NOMATCH if failed to match )
5944 a negative PCRE_ERROR_xxx value if aborted by an error condition
5945 (e.g. stopped by recursion limit)
5949 match(REGISTER const uschar *eptr, REGISTER const uschar *ecode,
5950 int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
5953 /* These variables do not need to be preserved over recursion in this function,
5954 so they can be ordinary variables in all cases. Mark them with "register"
5955 because they are used a lot in loops. */
5957 register int rrc; /* Returns from recursive calls */
5958 register int i; /* Used for loops not involving calls to RMATCH() */
5959 register int c; /* Character values not kept over RMATCH() calls */
5961 /* When recursion is not being used, all "local" variables that have to be
5962 preserved over calls to RMATCH() are part of a "frame" which is obtained from
5963 heap storage. Set up the top-level frame here; others are obtained from the
5964 heap whenever RMATCH() does a "recursion". See the macro definitions above. */
5967 heapframe *frame = (pcre_stack_malloc)(sizeof(heapframe));
5968 frame->Xprevframe = NULL; /* Marks the top level */
5970 /* Copy in the original argument variables */
5972 frame->Xeptr = eptr;
5973 frame->Xecode = ecode;
5974 frame->Xoffset_top = offset_top;
5976 frame->Xeptrb = eptrb;
5977 frame->Xflags = flags;
5979 /* This is where control jumps back to in order to effect "recursion" */
5983 /* Macros make the argument variables come from the current frame */
5985 #define eptr frame->Xeptr
5986 #define ecode frame->Xecode
5987 #define offset_top frame->Xoffset_top
5988 #define ims frame->Xims
5989 #define eptrb frame->Xeptrb
5990 #define flags frame->Xflags
5992 /* Ditto for the local variables */
5995 #define charptr frame->Xcharptr
5997 #define callpat frame->Xcallpat
5998 #define data frame->Xdata
5999 #define next frame->Xnext
6000 #define pp frame->Xpp
6001 #define prev frame->Xprev
6002 #define saved_eptr frame->Xsaved_eptr
6004 #define new_recursive frame->Xnew_recursive
6006 #define cur_is_word frame->Xcur_is_word
6007 #define condition frame->Xcondition
6008 #define minimize frame->Xminimize
6009 #define prev_is_word frame->Xprev_is_word
6011 #define original_ims frame->Xoriginal_ims
6014 #define prop_type frame->Xprop_type
6015 #define prop_fail_result frame->Xprop_fail_result
6016 #define prop_category frame->Xprop_category
6017 #define prop_chartype frame->Xprop_chartype
6018 #define prop_othercase frame->Xprop_othercase
6019 #define prop_test_against frame->Xprop_test_against
6020 #define prop_test_variable frame->Xprop_test_variable
6023 #define ctype frame->Xctype
6024 #define fc frame->Xfc
6025 #define fi frame->Xfi
6026 #define length frame->Xlength
6027 #define max frame->Xmax
6028 #define min frame->Xmin
6029 #define number frame->Xnumber
6030 #define offset frame->Xoffset
6031 #define op frame->Xop
6032 #define save_capture_last frame->Xsave_capture_last
6033 #define save_offset1 frame->Xsave_offset1
6034 #define save_offset2 frame->Xsave_offset2
6035 #define save_offset3 frame->Xsave_offset3
6036 #define stacksave frame->Xstacksave
6038 #define newptrb frame->Xnewptrb
6040 /* When recursion is being used, local variables are allocated on the stack and
6041 get preserved during recursion in the normal way. In this environment, fi and
6042 i, and fc and c, can be the same variables. */
6049 #ifdef SUPPORT_UTF8 /* Many of these variables are used ony */
6050 const uschar *charptr; /* small blocks of the code. My normal */
6051 #endif /* style of coding would have declared */
6052 const uschar *callpat; /* them within each of those blocks. */
6053 const uschar *data; /* However, in order to accommodate the */
6054 const uschar *next; /* version of this code that uses an */
6055 const uschar *pp; /* external "stack" implemented on the */
6056 const uschar *prev; /* heap, it is easier to declare them */
6057 const uschar *saved_eptr; /* all here, so the declarations can */
6058 /* be cut out in a block. The only */
6059 recursion_info new_recursive; /* declarations within blocks below are */
6060 /* for variables that do not have to */
6061 BOOL cur_is_word; /* be preserved over a recursive call */
6062 BOOL condition; /* to RMATCH(). */
6066 unsigned long int original_ims;
6070 int prop_fail_result;
6074 int prop_test_against;
6075 int *prop_test_variable;
6085 int save_capture_last;
6086 int save_offset1, save_offset2, save_offset3;
6087 int stacksave[REC_STACK_SAVE_MAX];
6092 /* These statements are here to stop the compiler complaining about uninitialized
6096 prop_fail_result = 0;
6097 prop_test_against = 0;
6098 prop_test_variable = NULL;
6101 /* OK, now we can get on with the real code of the function. Recursion is
6102 specified by the macros RMATCH and RRETURN. When NO_RECURSE is *not* defined,
6103 these just turn into a recursive call to match() and a "return", respectively.
6104 However, RMATCH isn't like a function call because it's quite a complicated
6105 macro. It has to be used in one particular way. This shouldn't, however, impact
6106 performance when true recursion is being used. */
6108 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
6110 original_ims = ims; /* Save for resetting on ')' */
6112 /* At the start of a bracketed group, add the current subject pointer to the
6113 stack of such pointers, to be re-instated at the end of the group when we hit
6114 the closing ket. When match() is called in other circumstances, we don't add to
6117 if ((flags & match_isgroup) != 0)
6119 newptrb.epb_prev = eptrb;
6120 newptrb.epb_saved_eptr = eptr;
6124 /* Now start processing the operations. */
6131 /* For partial matching, remember if we ever hit the end of the subject after
6132 matching at least one subject character. */
6135 eptr >= md->end_subject &&
6136 eptr > md->start_match)
6139 /* Opening capturing bracket. If there is space in the offset vector, save
6140 the current subject position in the working slot at the top of the vector. We
6141 mustn't change the current values of the data slot, because they may be set
6142 from a previous iteration of this group, and be referred to by a reference
6145 If the bracket fails to match, we need to restore this value and also the
6146 values of the final offsets, in case they were set by a previous iteration of
6149 If there isn't enough space in the offset vector, treat this as if it were a
6150 non-capturing bracket. Don't worry about setting the flag for the error case
6151 here; that is handled in the code for KET. */
6155 number = op - OP_BRA;
6157 /* For extended extraction brackets (large number), we have to fish out the
6158 number from a dummy opcode at the start. */
6160 if (number > EXTRACT_BASIC_MAX)
6161 number = GET2(ecode, 2+LINK_SIZE);
6162 offset = number << 1;
6165 printf("start bracket %d subject=", number);
6166 pchars(eptr, 16, TRUE, md);
6170 if (offset < md->offset_max)
6172 save_offset1 = md->offset_vector[offset];
6173 save_offset2 = md->offset_vector[offset+1];
6174 save_offset3 = md->offset_vector[md->offset_end - number];
6175 save_capture_last = md->capture_last;
6177 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
6178 md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
6182 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb,
6184 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6185 md->capture_last = save_capture_last;
6186 ecode += GET(ecode, 1);
6188 while (*ecode == OP_ALT);
6190 DPRINTF(("bracket %d failed\n", number));
6192 md->offset_vector[offset] = save_offset1;
6193 md->offset_vector[offset+1] = save_offset2;
6194 md->offset_vector[md->offset_end - number] = save_offset3;
6196 RRETURN(MATCH_NOMATCH);
6199 /* Insufficient room for saving captured contents */
6204 /* Other types of node can be handled by a switch */
6208 case OP_BRA: /* Non-capturing bracket: optimized */
6209 DPRINTF(("start bracket 0\n"));
6212 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb,
6214 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6215 ecode += GET(ecode, 1);
6217 while (*ecode == OP_ALT);
6218 DPRINTF(("bracket 0 failed\n"));
6219 RRETURN(MATCH_NOMATCH);
6221 /* Conditional group: compilation checked that there are no more than
6222 two branches. If the condition is false, skipping the first branch takes us
6223 past the end if there is only one branch, but that's OK because that is
6224 exactly what going to the ket would do. */
6227 if (ecode[LINK_SIZE+1] == OP_CREF) /* Condition extract or recurse test */
6229 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
6230 condition = (offset == CREF_RECURSE * 2)?
6231 (md->recursive != NULL) :
6232 (offset < offset_top && md->offset_vector[offset] >= 0);
6233 RMATCH(rrc, eptr, ecode + (condition?
6234 (LINK_SIZE + 4) : (LINK_SIZE + 1 + GET(ecode, 1))),
6235 offset_top, md, ims, eptrb, match_isgroup);
6239 /* The condition is an assertion. Call match() to evaluate it - setting
6240 the final argument TRUE causes it to stop at the end of an assertion. */
6244 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
6245 match_condassert | match_isgroup);
6246 if (rrc == MATCH_MATCH)
6248 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE+2);
6249 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
6251 else if (rrc != MATCH_NOMATCH)
6253 RRETURN(rrc); /* Need braces because of following else */
6255 else ecode += GET(ecode, 1);
6256 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb,
6260 /* Control never reaches here */
6262 /* Skip over conditional reference or large extraction number data if
6270 /* End of the pattern. If we are in a recursion, we should restore the
6271 offsets appropriately and continue from after the call. */
6274 if (md->recursive != NULL && md->recursive->group_num == 0)
6276 recursion_info *rec = md->recursive;
6277 DPRINTF(("Hit the end in a (?0) recursion\n"));
6278 md->recursive = rec->prevrec;
6279 memmove(md->offset_vector, rec->offset_save,
6280 rec->saved_max * sizeof(int));
6281 md->start_match = rec->save_start;
6283 ecode = rec->after_call;
6287 /* Otherwise, if PCRE_NOTEMPTY is set, fail if we have matched an empty
6288 string - backtracking will then try other alternatives, if any. */
6290 if (md->notempty && eptr == md->start_match) RRETURN(MATCH_NOMATCH);
6291 md->end_match_ptr = eptr; /* Record where we ended */
6292 md->end_offset_top = offset_top; /* and how many extracts were taken */
6293 RRETURN(MATCH_MATCH);
6295 /* Change option settings */
6300 DPRINTF(("ims set to %02lx\n", ims));
6303 /* Assertion brackets. Check the alternative branches in turn - the
6304 matching won't pass the KET for an assertion. If any one branch matches,
6305 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
6306 start of each branch to move the current point backwards, so the code at
6307 this level is identical to the lookahead case. */
6313 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
6315 if (rrc == MATCH_MATCH) break;
6316 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6317 ecode += GET(ecode, 1);
6319 while (*ecode == OP_ALT);
6320 if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
6322 /* If checking an assertion for a condition, return MATCH_MATCH. */
6324 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
6326 /* Continue from after the assertion, updating the offsets high water
6327 mark, since extracts may have been taken during the assertion. */
6329 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
6330 ecode += 1 + LINK_SIZE;
6331 offset_top = md->end_offset_top;
6334 /* Negative assertion: all branches must fail to match */
6337 case OP_ASSERTBACK_NOT:
6340 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
6342 if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH);
6343 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6344 ecode += GET(ecode,1);
6346 while (*ecode == OP_ALT);
6348 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
6350 ecode += 1 + LINK_SIZE;
6353 /* Move the subject pointer back. This occurs only at the start of
6354 each branch of a lookbehind assertion. If we are too close to the start to
6355 move back, this match function fails. When working with UTF-8 we move
6356 back a number of characters, not bytes. */
6363 for (i = 0; i < c; i++)
6366 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
6373 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
6376 eptr -= GET(ecode,1);
6377 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
6380 /* Skip to next op code */
6382 ecode += 1 + LINK_SIZE;
6385 /* The callout item calls an external function, if one is provided, passing
6386 details of the match so far. This is mainly for debugging, though the
6387 function is able to force a failure. */
6390 if (pcre_callout != NULL)
6392 pcre_callout_block cb;
6393 cb.version = 1; /* Version 1 of the callout block */
6394 cb.callout_number = ecode[1];
6395 cb.offset_vector = md->offset_vector;
6396 cb.subject = (const char *)md->start_subject;
6397 cb.subject_length = md->end_subject - md->start_subject;
6398 cb.start_match = md->start_match - md->start_subject;
6399 cb.current_position = eptr - md->start_subject;
6400 cb.pattern_position = GET(ecode, 2);
6401 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
6402 cb.capture_top = offset_top/2;
6403 cb.capture_last = md->capture_last;
6404 cb.callout_data = md->callout_data;
6405 if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
6406 if (rrc < 0) RRETURN(rrc);
6408 ecode += 2 + 2*LINK_SIZE;
6411 /* Recursion either matches the current regex, or some subexpression. The
6412 offset data is the offset to the starting bracket from the start of the
6413 whole pattern. (This is so that it works from duplicated subpatterns.)
6415 If there are any capturing brackets started but not finished, we have to
6416 save their starting points and reinstate them after the recursion. However,
6417 we don't know how many such there are (offset_top records the completed
6418 total) so we just have to save all the potential data. There may be up to
6419 65535 such values, which is too large to put on the stack, but using malloc
6420 for small numbers seems expensive. As a compromise, the stack is used when
6421 there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
6422 is used. If the malloc fails, no recursion is attempted: the match is
6423 abandoned by returning PCRE_ERROR_NOMEMORY, so there is never a partially
6424 saved set of values.
6426 There are also other values that have to be saved. We use a chained
6427 sequence of blocks that actually live on the stack. Thanks to Robin Houston
6428 for the original version of this logic. */
6432 callpat = md->start_code + GET(ecode, 1);
6433 new_recursive.group_num = *callpat - OP_BRA;
6435 /* For extended extraction brackets (large number), we have to fish out
6436 the number from a dummy opcode at the start. */
6438 if (new_recursive.group_num > EXTRACT_BASIC_MAX)
6439 new_recursive.group_num = GET2(callpat, 2+LINK_SIZE);
6441 /* Add to "recursing stack" */
6443 new_recursive.prevrec = md->recursive;
6444 md->recursive = &new_recursive;
6446 /* Find where to continue from afterwards */
6448 ecode += 1 + LINK_SIZE;
6449 new_recursive.after_call = ecode;
6451 /* Now save the offset data. */
6453 new_recursive.saved_max = md->offset_end;
6454 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
6455 new_recursive.offset_save = stacksave;
6458 new_recursive.offset_save =
6459 (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
6460 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
6463 memcpy(new_recursive.offset_save, md->offset_vector,
6464 new_recursive.saved_max * sizeof(int));
6465 new_recursive.save_start = md->start_match;
6466 md->start_match = eptr;
6468 /* OK, now we can do the recursion. For each top-level alternative we
6469 restore the offset and recursion data. */
6471 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
6474 RMATCH(rrc, eptr, callpat + 1 + LINK_SIZE, offset_top, md, ims,
6475 eptrb, match_isgroup);
6476 if (rrc == MATCH_MATCH)
6478 md->recursive = new_recursive.prevrec;
6479 if (new_recursive.offset_save != stacksave)
6480 (pcre_free)(new_recursive.offset_save);
6481 RRETURN(MATCH_MATCH);
6483 else if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6485 md->recursive = &new_recursive;
6486 memcpy(md->offset_vector, new_recursive.offset_save,
6487 new_recursive.saved_max * sizeof(int));
6488 callpat += GET(callpat, 1);
6490 while (*callpat == OP_ALT);
6492 DPRINTF(("Recursion didn't match\n"));
6493 md->recursive = new_recursive.prevrec;
6494 if (new_recursive.offset_save != stacksave)
6495 (pcre_free)(new_recursive.offset_save);
6496 RRETURN(MATCH_NOMATCH);
6498 /* Control never reaches here */
6500 /* "Once" brackets are like assertion brackets except that after a match,
6501 the point in the subject string is not moved back. Thus there can never be
6502 a move back into the brackets. Friedl calls these "atomic" subpatterns.
6503 Check the alternative branches in turn - the matching won't pass the KET
6504 for this kind of subpattern. If any one branch matches, we carry on as at
6505 the end of a normal bracket, leaving the subject pointer. */
6514 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims,
6515 eptrb, match_isgroup);
6516 if (rrc == MATCH_MATCH) break;
6517 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6518 ecode += GET(ecode,1);
6520 while (*ecode == OP_ALT);
6522 /* If hit the end of the group (which could be repeated), fail */
6524 if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
6526 /* Continue as from after the assertion, updating the offsets high water
6527 mark, since extracts may have been taken. */
6529 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
6531 offset_top = md->end_offset_top;
6532 eptr = md->end_match_ptr;
6534 /* For a non-repeating ket, just continue at this level. This also
6535 happens for a repeating ket if no characters were matched in the group.
6536 This is the forcible breaking of infinite loops as implemented in Perl
6537 5.005. If there is an options reset, it will get obeyed in the normal
6538 course of events. */
6540 if (*ecode == OP_KET || eptr == saved_eptr)
6542 ecode += 1+LINK_SIZE;
6546 /* The repeating kets try the rest of the pattern or restart from the
6547 preceding bracket, in the appropriate order. We need to reset any options
6548 that changed within the bracket before re-running it, so check whether the next item is an OP_OPT. */
6551 if (ecode[1+LINK_SIZE] == OP_OPT)
6553 ims = (ims & ~PCRE_IMS) | ecode[4];
6554 DPRINTF(("ims set to %02lx at group repeat\n", ims));
6557 if (*ecode == OP_KETRMIN)
6559 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0);
6560 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6561 RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);
6562 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6564 else /* OP_KETRMAX */
6566 RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);
6567 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6568 RMATCH(rrc, eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0);
6569 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6572 RRETURN(MATCH_NOMATCH);
6574 /* An alternation is the end of a branch; scan along to find the end of the
6575 bracketed group and go to there. */
6578 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
6581 /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
6582 that it may occur zero times. It may repeat infinitely, or not at all -
6583 i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
6584 repeat limits are compiled as a number of copies, with the optional ones
6585 preceded by BRAZERO or BRAMINZERO. */
6590 RMATCH(rrc, eptr, next, offset_top, md, ims, eptrb, match_isgroup);
6591 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6592 do next += GET(next,1); while (*next == OP_ALT);
6593 ecode = next + 1+LINK_SIZE;
6600 do next += GET(next,1); while (*next == OP_ALT);
6601 RMATCH(rrc, eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb,
6603 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6608 /* End of a group, repeated or non-repeating. If we are at the end of
6609 an assertion "group", stop matching and return MATCH_MATCH, but record the
6610 current high water mark for use by positive assertions. Do this also
6611 for the "once" (atomic, no backing up) groups. */
6617 prev = ecode - GET(ecode, 1);
6618 saved_eptr = eptrb->epb_saved_eptr;
6620 /* Back up the stack of bracket start pointers. */
6622 eptrb = eptrb->epb_prev;
6624 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
6625 *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
6628 md->end_match_ptr = eptr; /* For ONCE */
6629 md->end_offset_top = offset_top;
6630 RRETURN(MATCH_MATCH);
6633 /* In all other cases except a conditional group we have to check the
6634 group number back at the start and if necessary complete handling an
6635 extraction by setting the offsets and bumping the high water mark. */
6637 if (*prev != OP_COND)
6639 number = *prev - OP_BRA;
6641 /* For extended extraction brackets (large number), we have to fish out
6642 the number from a dummy opcode at the start. */
6644 if (number > EXTRACT_BASIC_MAX) number = GET2(prev, 2+LINK_SIZE);
6645 offset = number << 1;
6648 printf("end bracket %d", number);
6652 /* Test for a numbered group. This includes groups called as a result
6653 of recursion. Note that whole-pattern recursion is coded as a recurse
6654 into group 0, so it won't be picked up here. Instead, we catch it when
6655 the OP_END is reached. */
6659 md->capture_last = number;
6660 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
6662 md->offset_vector[offset] =
6663 md->offset_vector[md->offset_end - number];
6664 md->offset_vector[offset+1] = eptr - md->start_subject;
6665 if (offset_top <= offset) offset_top = offset + 2;
6668 /* Handle a recursively called group. Restore the offsets
6669 appropriately and continue from after the call. */
6671 if (md->recursive != NULL && md->recursive->group_num == number)
6673 recursion_info *rec = md->recursive;
6674 DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
6675 md->recursive = rec->prevrec;
6676 md->start_match = rec->save_start;
6677 memcpy(md->offset_vector, rec->offset_save,
6678 rec->saved_max * sizeof(int));
6679 ecode = rec->after_call;
6686 /* Reset the value of the ims flags, in case they got changed during
6690 DPRINTF(("ims reset to %02lx\n", ims));
6692 /* For a non-repeating ket, just continue at this level. This also
6693 happens for a repeating ket if no characters were matched in the group.
6694 This is the forcible breaking of infinite loops as implemented in Perl
6695 5.005. If there is an options reset, it will get obeyed in the normal
6696 course of events. */
6698 if (*ecode == OP_KET || eptr == saved_eptr)
6700 ecode += 1 + LINK_SIZE;
6704 /* The repeating kets try the rest of the pattern or restart from the
6705 preceding bracket, in the appropriate order. */
6707 if (*ecode == OP_KETRMIN)
6709 RMATCH(rrc, eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0);
6710 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6711 RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);
6712 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6714 else /* OP_KETRMAX */
6716 RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);
6717 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6718 RMATCH(rrc, eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0);
6719 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6723 RRETURN(MATCH_NOMATCH);
6725 /* Start of subject unless notbol, or after internal newline if multiline */
6728 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
6729 if ((ims & PCRE_MULTILINE) != 0)
6731 if (eptr != md->start_subject && eptr[-1] != NEWLINE)
6732 RRETURN(MATCH_NOMATCH);
6736 /* ... else fall through */
6738 /* Start of subject assertion */
6741 if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
6745 /* Start of match assertion */
6748 if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
6752 /* Assert before internal newline if multiline, or before a terminating
6753 newline unless endonly is set, else end of subject unless noteol is set. */
6756 if ((ims & PCRE_MULTILINE) != 0)
6758 if (eptr < md->end_subject)
6759 { if (*eptr != NEWLINE) RRETURN(MATCH_NOMATCH); }
6761 { if (md->noteol) RRETURN(MATCH_NOMATCH); }
6767 if (md->noteol) RRETURN(MATCH_NOMATCH);
6770 if (eptr < md->end_subject - 1 ||
6771 (eptr == md->end_subject - 1 && *eptr != NEWLINE))
6772 RRETURN(MATCH_NOMATCH);
6777 /* ... else fall through */
6779 /* End of subject assertion (\z) */
6782 if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
6786 /* End of subject or ending \n assertion (\Z) */
6789 if (eptr < md->end_subject - 1 ||
6790 (eptr == md->end_subject - 1 && *eptr != NEWLINE)) RRETURN(MATCH_NOMATCH);
6794 /* Word boundary assertions */
6796 case OP_NOT_WORD_BOUNDARY:
6797 case OP_WORD_BOUNDARY:
6800 /* Find out if the previous and current characters are "word" characters.
6801 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
6802 be "non-word" characters. */
6807 if (eptr == md->start_subject) prev_is_word = FALSE; else
6809 const uschar *lastptr = eptr - 1;
6810 while((*lastptr & 0xc0) == 0x80) lastptr--;
6811 GETCHAR(c, lastptr);
6812 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
6814 if (eptr >= md->end_subject) cur_is_word = FALSE; else
6817 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
6823 /* More streamlined when not in UTF-8 mode */
6826 prev_is_word = (eptr != md->start_subject) &&
6827 ((md->ctypes[eptr[-1]] & ctype_word) != 0);
6828 cur_is_word = (eptr < md->end_subject) &&
6829 ((md->ctypes[*eptr] & ctype_word) != 0);
6832 /* Now see if the situation is what we want */
6834 if ((*ecode++ == OP_WORD_BOUNDARY)?
6835 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
6836 RRETURN(MATCH_NOMATCH);
6840 /* Match a single character type; inline for speed */
6843 if ((ims & PCRE_DOTALL) == 0 && eptr < md->end_subject && *eptr == NEWLINE)
6844 RRETURN(MATCH_NOMATCH);
6845 if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
6848 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
6853 /* Match a single byte, even in UTF-8 mode. This opcode really does match
6854 any byte, even newline, independent of the setting of PCRE_DOTALL. */
6857 if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
6862 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
6863 GETCHARINCTEST(c, eptr);
6868 (md->ctypes[c] & ctype_digit) != 0
6870 RRETURN(MATCH_NOMATCH);
6875 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
6876 GETCHARINCTEST(c, eptr);
6881 (md->ctypes[c] & ctype_digit) == 0
6883 RRETURN(MATCH_NOMATCH);
6887 case OP_NOT_WHITESPACE:
6888 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
6889 GETCHARINCTEST(c, eptr);
6894 (md->ctypes[c] & ctype_space) != 0
6896 RRETURN(MATCH_NOMATCH);
6901 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
6902 GETCHARINCTEST(c, eptr);
6907 (md->ctypes[c] & ctype_space) == 0
6909 RRETURN(MATCH_NOMATCH);
6913 case OP_NOT_WORDCHAR:
6914 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
6915 GETCHARINCTEST(c, eptr);
6920 (md->ctypes[c] & ctype_word) != 0
6922 RRETURN(MATCH_NOMATCH);
6927 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
6928 GETCHARINCTEST(c, eptr);
6933 (md->ctypes[c] & ctype_word) == 0
6935 RRETURN(MATCH_NOMATCH);
6940 /* Check the next character by Unicode property. We will get here only
6941 if the support is in the binary; otherwise a compile-time error occurs. */
6945 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
6946 GETCHARINCTEST(c, eptr);
6948 int chartype, rqdtype;
6950 int category = ucp_findchar(c, &chartype, &othercase);
6952 rqdtype = *(++ecode);
6957 if ((rqdtype - 128 != category) == (op == OP_PROP))
6958 RRETURN(MATCH_NOMATCH);
6962 if ((rqdtype != chartype) == (op == OP_PROP))
6963 RRETURN(MATCH_NOMATCH);
6968 /* Match an extended Unicode sequence. We will get here only if the support
6969 is in the binary; otherwise a compile-time error occurs. */
6972 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
6973 GETCHARINCTEST(c, eptr);
6977 int category = ucp_findchar(c, &chartype, &othercase);
6978 if (category == ucp_M) RRETURN(MATCH_NOMATCH);
6979 while (eptr < md->end_subject)
6982 if (!md->utf8) c = *eptr; else
6984 GETCHARLEN(c, eptr, len);
6986 category = ucp_findchar(c, &chartype, &othercase);
6987 if (category != ucp_M) break;
6996 /* Match a back reference, possibly repeatedly. Look past the end of the
6997 item to see if there is repeat information following. The code is similar
6998 to that for character classes, but repeated for efficiency. Then obey
6999 similar code to character type repeats - written out again for speed.
7000 However, if the referenced string is the empty string, always treat
7001 it as matched, any number of times (otherwise there could be infinite loops). */
7006 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
7007 ecode += 3; /* Advance past item */
7009 /* If the reference is unset, set the length to be longer than the amount
7010 of subject left; this ensures that every attempt at a match fails. We
7011 can't just fail here, because of the possibility of quantifiers with zero
7014 length = (offset >= offset_top || md->offset_vector[offset] < 0)?
7015 md->end_subject - eptr + 1 :
7016 md->offset_vector[offset+1] - md->offset_vector[offset];
7018 /* Set up for repetition, or handle the non-repeated case */
7028 c = *ecode++ - OP_CRSTAR;
7029 minimize = (c & 1) != 0;
7030 min = rep_min[c]; /* Pick up values from tables; */
7031 max = rep_max[c]; /* zero for max => infinity */
7032 if (max == 0) max = INT_MAX;
7037 minimize = (*ecode == OP_CRMINRANGE);
7038 min = GET2(ecode, 1);
7039 max = GET2(ecode, 3);
7040 if (max == 0) max = INT_MAX;
7044 default: /* No repeat follows */
7045 if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
7047 continue; /* With the main loop */
7050 /* If the length of the reference is zero, just continue with the main loop. */
7053 if (length == 0) continue;
7055 /* First, ensure the minimum number of matches are present. We get back
7056 the length of the reference string explicitly rather than passing the
7057 address of eptr, so that eptr can be a register variable. */
7059 for (i = 1; i <= min; i++)
7061 if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
7065 /* If min = max, continue at the same level without recursion.
7066 They are not both allowed to be zero. */
7068 if (min == max) continue;
7070 /* If minimizing, keep trying and advancing the pointer */
7074 for (fi = min;; fi++)
7076 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7077 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7078 if (fi >= max || !match_ref(offset, eptr, length, md, ims))
7079 RRETURN(MATCH_NOMATCH);
7082 /* Control never gets here */
7085 /* If maximizing, find the longest string and work backwards */
7090 for (i = min; i < max; i++)
7092 if (!match_ref(offset, eptr, length, md, ims)) break;
7097 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7098 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7101 RRETURN(MATCH_NOMATCH);
7104 /* Control never gets here */
7108 /* Match a bit-mapped character class, possibly repeatedly. This op code is
7109 used when all the characters in the class have values in the range 0-255,
7110 and either the matching is caseful, or the characters are in the range
7111 0-127 when UTF-8 processing is enabled. The only difference between
7112 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
7115 First, look past the end of the item to see if there is repeat information
7116 following. Then obey similar code to character type repeats - written out
7122 data = ecode + 1; /* Save for matching */
7123 ecode += 33; /* Advance past the item */
7133 c = *ecode++ - OP_CRSTAR;
7134 minimize = (c & 1) != 0;
7135 min = rep_min[c]; /* Pick up values from tables; */
7136 max = rep_max[c]; /* zero for max => infinity */
7137 if (max == 0) max = INT_MAX;
7142 minimize = (*ecode == OP_CRMINRANGE);
7143 min = GET2(ecode, 1);
7144 max = GET2(ecode, 3);
7145 if (max == 0) max = INT_MAX;
7149 default: /* No repeat follows */
7154 /* First, ensure the minimum number of matches are present. */
7160 for (i = 1; i <= min; i++)
7162 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
7163 GETCHARINC(c, eptr);
7166 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
7170 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
7176 /* Not UTF-8 mode */
7178 for (i = 1; i <= min; i++)
7180 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
7182 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
7186 /* If max == min we can continue with the main loop without recursing. */
7189 if (min == max) continue;
7191 /* If minimizing, keep testing the rest of the expression and advancing
7192 the pointer while it matches the class. */
7200 for (fi = min;; fi++)
7202 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7203 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7204 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
7205 GETCHARINC(c, eptr);
7208 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
7212 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
7218 /* Not UTF-8 mode */
7220 for (fi = min;; fi++)
7222 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7223 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7224 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
7226 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
7229 /* Control never gets here */
7232 /* If maximizing, find the longest possible run, then work backwards. */
7242 for (i = min; i < max; i++)
7245 if (eptr >= md->end_subject) break;
7246 GETCHARLEN(c, eptr, len);
7249 if (op == OP_CLASS) break;
7253 if ((data[c/8] & (1 << (c&7))) == 0) break;
7259 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7260 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7261 if (eptr-- == pp) break; /* Stop if tried at original pos */
7267 /* Not UTF-8 mode */
7269 for (i = min; i < max; i++)
7271 if (eptr >= md->end_subject) break;
7273 if ((data[c/8] & (1 << (c&7))) == 0) break;
7278 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7280 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7284 RRETURN(MATCH_NOMATCH);
7287 /* Control never gets here */
7290 /* Match an extended character class. This opcode is encountered only
7291 in UTF-8 mode, because that's the only time it is compiled. */
7296 data = ecode + 1 + LINK_SIZE; /* Save for matching */
7297 ecode += GET(ecode, 1); /* Advance past the item */
7307 c = *ecode++ - OP_CRSTAR;
7308 minimize = (c & 1) != 0;
7309 min = rep_min[c]; /* Pick up values from tables; */
7310 max = rep_max[c]; /* zero for max => infinity */
7311 if (max == 0) max = INT_MAX;
7316 minimize = (*ecode == OP_CRMINRANGE);
7317 min = GET2(ecode, 1);
7318 max = GET2(ecode, 3);
7319 if (max == 0) max = INT_MAX;
7323 default: /* No repeat follows */
7328 /* First, ensure the minimum number of matches are present. */
7330 for (i = 1; i <= min; i++)
7332 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
7333 GETCHARINC(c, eptr);
7334 if (!match_xclass(c, data)) RRETURN(MATCH_NOMATCH);
7337 /* If max == min we can continue with the main loop without recursing. */
7340 if (min == max) continue;
7342 /* If minimizing, keep testing the rest of the expression and advancing
7343 the pointer while it matches the class. */
7347 for (fi = min;; fi++)
7349 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7350 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7351 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
7352 GETCHARINC(c, eptr);
7353 if (!match_xclass(c, data)) RRETURN(MATCH_NOMATCH);
7355 /* Control never gets here */
7358 /* If maximizing, find the longest possible run, then work backwards. */
7363 for (i = min; i < max; i++)
7366 if (eptr >= md->end_subject) break;
7367 GETCHARLEN(c, eptr, len);
7368 if (!match_xclass(c, data)) break;
7373 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7374 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7375 if (eptr-- == pp) break; /* Stop if tried at original pos */
7378 RRETURN(MATCH_NOMATCH);
7381 /* Control never gets here */
7383 #endif /* End of XCLASS */
7385 /* Match a single character, casefully */
7393 GETCHARLEN(fc, ecode, length);
7394 if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
7395 while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
7400 /* Non-UTF-8 mode */
7402 if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
7403 if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
7408 /* Match a single character, caselessly */
7416 GETCHARLEN(fc, ecode, length);
7418 if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
7420 /* If the pattern character's value is < 128, we have only one byte, and
7421 can use the fast lookup table. */
7425 if (md->lcc[*ecode++] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
7428 /* Otherwise we must pick up the subject character */
7433 GETCHARINC(dc, eptr);
7436 /* If we have Unicode property support, we can use it to test the other
7437 case of the character, if there is one. The result of ucp_findchar() is
7438 < 0 if the char isn't found, and othercase is returned as zero if there
7446 if (ucp_findchar(fc, &chartype, &othercase) < 0 || dc != othercase)
7448 RRETURN(MATCH_NOMATCH);
7453 #endif /* SUPPORT_UTF8 */
7455 /* Non-UTF-8 mode */
7457 if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
7458 if (md->lcc[ecode[1]] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
7463 /* Match a single character repeatedly; different opcodes share code. */
7466 min = max = GET2(ecode, 1);
7473 max = GET2(ecode, 1);
7474 minimize = *ecode == OP_MINUPTO;
7484 c = *ecode++ - OP_STAR;
7485 minimize = (c & 1) != 0;
7486 min = rep_min[c]; /* Pick up values from tables; */
7487 max = rep_max[c]; /* zero for max => infinity */
7488 if (max == 0) max = INT_MAX;
7490 /* Common code for all repeated single-character matches. We can give
7491 up quickly if there are fewer than the minimum number of characters left in
7500 GETCHARLEN(fc, ecode, length);
7501 if (min * length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
7504 /* Handle multibyte character matching specially here. There is
7505 support for caseless matching if UCP support is present. */
7515 if ((ims & PCRE_CASELESS) != 0 &&
7516 ucp_findchar(fc, &chartype, &othercase) >= 0 &&
7518 oclength = ord2utf8(othercase, occhars);
7519 #endif /* SUPPORT_UCP */
7521 for (i = 1; i <= min; i++)
7523 if (memcmp(eptr, charptr, length) == 0) eptr += length;
7524 /* Need braces because of following else */
7525 else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
7528 if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
7533 if (min == max) continue;
7537 for (fi = min;; fi++)
7539 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7540 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7541 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
7542 if (memcmp(eptr, charptr, length) == 0) eptr += length;
7543 /* Need braces because of following else */
7544 else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
7547 if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
7551 /* Control never gets here */
7556 for (i = min; i < max; i++)
7558 if (eptr > md->end_subject - length) break;
7559 if (memcmp(eptr, charptr, length) == 0) eptr += length;
7560 else if (oclength == 0) break;
7563 if (memcmp(eptr, occhars, oclength) != 0) break;
7569 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7570 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7573 RRETURN(MATCH_NOMATCH);
7575 /* Control never gets here */
7578 /* If the length of a UTF-8 character is 1, we fall through here, and
7579 obey the code as for non-UTF-8 characters below, though in this case the
7580 value of fc will always be < 128. */
7583 #endif /* SUPPORT_UTF8 */
7585 /* When not in UTF-8 mode, load a single-byte character. */
7587 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
7591 /* The value of fc at this point is always less than 256, though we may or
7592 may not be in UTF-8 mode. The code is duplicated for the caseless and
7593 caseful cases, for speed, since matching characters is likely to be quite
7594 common. First, ensure the minimum number of matches are present. If min =
7595 max, continue at the same level without recursing. Otherwise, if
7596 minimizing, keep trying the rest of the expression and advancing one
7597 matching character if failing, up to the maximum. Alternatively, if
7598 maximizing, find the maximum number of characters and work backwards. */
7600 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
7603 if ((ims & PCRE_CASELESS) != 0)
7606 for (i = 1; i <= min; i++)
7607 if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
7608 if (min == max) continue;
7611 for (fi = min;; fi++)
7613 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7614 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7615 if (fi >= max || eptr >= md->end_subject ||
7616 fc != md->lcc[*eptr++])
7617 RRETURN(MATCH_NOMATCH);
7619 /* Control never gets here */
7624 for (i = min; i < max; i++)
7626 if (eptr >= md->end_subject || fc != md->lcc[*eptr]) break;
7631 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7633 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7635 RRETURN(MATCH_NOMATCH);
7637 /* Control never gets here */
7640 /* Caseful comparisons (includes all multi-byte characters) */
7644 for (i = 1; i <= min; i++) if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
7645 if (min == max) continue;
7648 for (fi = min;; fi++)
7650 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7651 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7652 if (fi >= max || eptr >= md->end_subject || fc != *eptr++)
7653 RRETURN(MATCH_NOMATCH);
7655 /* Control never gets here */
7660 for (i = min; i < max; i++)
7662 if (eptr >= md->end_subject || fc != *eptr) break;
7667 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7669 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7671 RRETURN(MATCH_NOMATCH);
7674 /* Control never gets here */
7676 /* Match a negated single one-byte character. The character we are
7677 checking can be multibyte. */
7680 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
7682 GETCHARINCTEST(c, eptr);
7683 if ((ims & PCRE_CASELESS) != 0)
7689 if (md->lcc[*ecode++] == c) RRETURN(MATCH_NOMATCH);
7693 if (*ecode++ == c) RRETURN(MATCH_NOMATCH);
7697 /* Match a negated single one-byte character repeatedly. This is almost a
7698 repeat of the code for a repeated single character, but I haven't found a
7699 nice way of commoning these up that doesn't require a test of the
7700 positive/negative option for each character match. Maybe that wouldn't add
7701 very much to the time taken, but character matching *is* what this is all
7705 min = max = GET2(ecode, 1);
7712 max = GET2(ecode, 1);
7713 minimize = *ecode == OP_NOTMINUPTO;
7722 case OP_NOTMINQUERY:
7723 c = *ecode++ - OP_NOTSTAR;
7724 minimize = (c & 1) != 0;
7725 min = rep_min[c]; /* Pick up values from tables; */
7726 max = rep_max[c]; /* zero for max => infinity */
7727 if (max == 0) max = INT_MAX;
7729 /* Common code for all repeated single-byte matches. We can give up quickly
7730 if there are fewer than the minimum number of bytes left in the
7734 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
7737 /* The code is duplicated for the caseless and caseful cases, for speed,
7738 since matching characters is likely to be quite common. First, ensure the
7739 minimum number of matches are present. If min = max, continue at the same
7740 level without recursing. Otherwise, if minimizing, keep trying the rest of
7741 the expression and advancing one matching character if failing, up to the
7742 maximum. Alternatively, if maximizing, find the maximum number of
7743 characters and work backwards. */
7745 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
7748 if ((ims & PCRE_CASELESS) != 0)
7757 for (i = 1; i <= min; i++)
7759 GETCHARINC(d, eptr);
7760 if (d < 256) d = md->lcc[d];
7761 if (fc == d) RRETURN(MATCH_NOMATCH);
7767 /* Not UTF-8 mode */
7769 for (i = 1; i <= min; i++)
7770 if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
7773 if (min == max) continue;
7782 for (fi = min;; fi++)
7784 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7785 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7786 GETCHARINC(d, eptr);
7787 if (d < 256) d = md->lcc[d];
7788 if (fi >= max || eptr >= md->end_subject || fc == d)
7789 RRETURN(MATCH_NOMATCH);
7794 /* Not UTF-8 mode */
7796 for (fi = min;; fi++)
7798 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7799 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7800 if (fi >= max || eptr >= md->end_subject || fc == md->lcc[*eptr++])
7801 RRETURN(MATCH_NOMATCH);
7804 /* Control never gets here */
7818 for (i = min; i < max; i++)
7821 if (eptr >= md->end_subject) break;
7822 GETCHARLEN(d, eptr, len);
7823 if (d < 256) d = md->lcc[d];
7829 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7830 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7831 if (eptr-- == pp) break; /* Stop if tried at original pos */
7837 /* Not UTF-8 mode */
7839 for (i = min; i < max; i++)
7841 if (eptr >= md->end_subject || fc == md->lcc[*eptr]) break;
7846 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7847 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7852 RRETURN(MATCH_NOMATCH);
7854 /* Control never gets here */
7857 /* Caseful comparisons */
7866 for (i = 1; i <= min; i++)
7868 GETCHARINC(d, eptr);
7869 if (fc == d) RRETURN(MATCH_NOMATCH);
7874 /* Not UTF-8 mode */
7876 for (i = 1; i <= min; i++)
7877 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
7880 if (min == max) continue;
7889 for (fi = min;; fi++)
7891 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7892 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7893 GETCHARINC(d, eptr);
7894 if (fi >= max || eptr >= md->end_subject || fc == d)
7895 RRETURN(MATCH_NOMATCH);
7900 /* Not UTF-8 mode */
7902 for (fi = min;; fi++)
7904 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7905 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7906 if (fi >= max || eptr >= md->end_subject || fc == *eptr++)
7907 RRETURN(MATCH_NOMATCH);
7910 /* Control never gets here */
7924 for (i = min; i < max; i++)
7927 if (eptr >= md->end_subject) break;
7928 GETCHARLEN(d, eptr, len);
7934 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7935 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7936 if (eptr-- == pp) break; /* Stop if tried at original pos */
7942 /* Not UTF-8 mode */
7944 for (i = min; i < max; i++)
7946 if (eptr >= md->end_subject || fc == *eptr) break;
7951 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
7952 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
7957 RRETURN(MATCH_NOMATCH);
7960 /* Control never gets here */
7962 /* Match a single character type repeatedly; several different opcodes
7963 share code. This is very similar to the code for single characters, but we
7964 repeat it in the interests of efficiency. */
7967 min = max = GET2(ecode, 1);
7973 case OP_TYPEMINUPTO:
7975 max = GET2(ecode, 1);
7976 minimize = *ecode == OP_TYPEMINUPTO;
7981 case OP_TYPEMINSTAR:
7983 case OP_TYPEMINPLUS:
7985 case OP_TYPEMINQUERY:
7986 c = *ecode++ - OP_TYPESTAR;
7987 minimize = (c & 1) != 0;
7988 min = rep_min[c]; /* Pick up values from tables; */
7989 max = rep_max[c]; /* zero for max => infinity */
7990 if (max == 0) max = INT_MAX;
7992 /* Common code for all repeated single character type matches. Note that
7993 in UTF-8 mode, '.' matches a character of any length, but for the other
7994 character types, the valid characters are all one-byte long. */
7997 ctype = *ecode++; /* Code for the character type */
8000 if (ctype == OP_PROP || ctype == OP_NOTPROP)
8002 prop_fail_result = ctype == OP_NOTPROP;
8003 prop_type = *ecode++;
8004 if (prop_type >= 128)
8006 prop_test_against = prop_type - 128;
8007 prop_test_variable = &prop_category;
8011 prop_test_against = prop_type;
8012 prop_test_variable = &prop_chartype;
8015 else prop_type = -1;
8018 /* First, ensure the minimum number of matches are present. Use inline
8019 code for maximizing the speed, and do the type test once at the start
8020 (i.e. keep it out of the loop). Also we can test that there are at least
8021 the minimum number of bytes before we start. This isn't as effective in
8022 UTF-8 mode, but it does no harm. Separate the UTF-8 code completely as that
8023 is tidier. Also separate the UCP code, which can be the same for both UTF-8
8024 and single-bytes. */
8026 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
8032 for (i = 1; i <= min; i++)
8034 GETCHARINC(c, eptr);
8035 prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
8036 if ((*prop_test_variable == prop_test_against) == prop_fail_result)
8037 RRETURN(MATCH_NOMATCH);
8041 /* Match extended Unicode sequences. We will get here only if the
8042 support is in the binary; otherwise a compile-time error occurs. */
8044 else if (ctype == OP_EXTUNI)
8046 for (i = 1; i <= min; i++)
8048 GETCHARINCTEST(c, eptr);
8049 prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
8050 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
8051 while (eptr < md->end_subject)
8054 if (!md->utf8) c = *eptr; else
8056 GETCHARLEN(c, eptr, len);
8058 prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
8059 if (prop_category != ucp_M) break;
8066 #endif /* SUPPORT_UCP */
8068 /* Handle all other cases when the coding is UTF-8 */
8071 if (md->utf8) switch(ctype)
8074 for (i = 1; i <= min; i++)
8076 if (eptr >= md->end_subject ||
8077 (*eptr++ == NEWLINE && (ims & PCRE_DOTALL) == 0))
8078 RRETURN(MATCH_NOMATCH);
8079 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
8088 for (i = 1; i <= min; i++)
8090 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
8091 GETCHARINC(c, eptr);
8092 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
8093 RRETURN(MATCH_NOMATCH);
8098 for (i = 1; i <= min; i++)
8100 if (eptr >= md->end_subject ||
8101 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
8102 RRETURN(MATCH_NOMATCH);
8103 /* No need to skip more bytes - we know it's a 1-byte character */
8107 case OP_NOT_WHITESPACE:
8108 for (i = 1; i <= min; i++)
8110 if (eptr >= md->end_subject ||
8111 (*eptr < 128 && (md->ctypes[*eptr++] & ctype_space) != 0))
8112 RRETURN(MATCH_NOMATCH);
8113 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
8118 for (i = 1; i <= min; i++)
8120 if (eptr >= md->end_subject ||
8121 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
8122 RRETURN(MATCH_NOMATCH);
8123 /* No need to skip more bytes - we know it's a 1-byte character */
8127 case OP_NOT_WORDCHAR:
8128 for (i = 1; i <= min; i++)
8130 if (eptr >= md->end_subject ||
8131 (*eptr < 128 && (md->ctypes[*eptr++] & ctype_word) != 0))
8132 RRETURN(MATCH_NOMATCH);
8133 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
8138 for (i = 1; i <= min; i++)
8140 if (eptr >= md->end_subject ||
8141 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
8142 RRETURN(MATCH_NOMATCH);
8143 /* No need to skip more bytes - we know it's a 1-byte character */
8148 RRETURN(PCRE_ERROR_INTERNAL);
8149 } /* End switch(ctype) */
8152 #endif /* SUPPORT_UTF8 */
8154 /* Code for the non-UTF-8 case for minimum matching of operators other
8155 than OP_PROP and OP_NOTPROP. */
8160 if ((ims & PCRE_DOTALL) == 0)
8162 for (i = 1; i <= min; i++)
8163 if (*eptr++ == NEWLINE) RRETURN(MATCH_NOMATCH);
8173 for (i = 1; i <= min; i++)
8174 if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
8178 for (i = 1; i <= min; i++)
8179 if ((md->ctypes[*eptr++] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
8182 case OP_NOT_WHITESPACE:
8183 for (i = 1; i <= min; i++)
8184 if ((md->ctypes[*eptr++] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
8188 for (i = 1; i <= min; i++)
8189 if ((md->ctypes[*eptr++] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
8192 case OP_NOT_WORDCHAR:
8193 for (i = 1; i <= min; i++)
8194 if ((md->ctypes[*eptr++] & ctype_word) != 0)
8195 RRETURN(MATCH_NOMATCH);
8199 for (i = 1; i <= min; i++)
8200 if ((md->ctypes[*eptr++] & ctype_word) == 0)
8201 RRETURN(MATCH_NOMATCH);
8205 RRETURN(PCRE_ERROR_INTERNAL);
8209 /* If min = max, continue at the same level without recursing */
8211 if (min == max) continue;
8213 /* If minimizing, we have to test the rest of the pattern before each
8214 subsequent match. Again, separate the UTF-8 case for speed, and also
8215 separate the UCP cases. */
8222 for (fi = min;; fi++)
8224 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
8225 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
8226 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
8227 GETCHARINC(c, eptr);
8228 prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
8229 if ((*prop_test_variable == prop_test_against) == prop_fail_result)
8230 RRETURN(MATCH_NOMATCH);
8234 /* Match extended Unicode sequences. We will get here only if the
8235 support is in the binary; otherwise a compile-time error occurs. */
8237 else if (ctype == OP_EXTUNI)
8239 for (fi = min;; fi++)
8241 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
8242 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
8243 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
8244 GETCHARINCTEST(c, eptr);
8245 prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
8246 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
8247 while (eptr < md->end_subject)
8250 if (!md->utf8) c = *eptr; else
8252 GETCHARLEN(c, eptr, len);
8254 prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
8255 if (prop_category != ucp_M) break;
8262 #endif /* SUPPORT_UCP */
8268 for (fi = min;; fi++)
8270 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
8271 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
8272 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
8274 GETCHARINC(c, eptr);
8278 if ((ims & PCRE_DOTALL) == 0 && c == NEWLINE) RRETURN(MATCH_NOMATCH);
8285 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
8286 RRETURN(MATCH_NOMATCH);
8290 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
8291 RRETURN(MATCH_NOMATCH);
8294 case OP_NOT_WHITESPACE:
8295 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
8296 RRETURN(MATCH_NOMATCH);
8300 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
8301 RRETURN(MATCH_NOMATCH);
8304 case OP_NOT_WORDCHAR:
8305 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
8306 RRETURN(MATCH_NOMATCH);
8310 if (c >= 256 && (md->ctypes[c] & ctype_word) == 0)
8311 RRETURN(MATCH_NOMATCH);
8315 RRETURN(PCRE_ERROR_INTERNAL);
8321 /* Not UTF-8 mode */
8323 for (fi = min;; fi++)
8325 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
8326 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
8327 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
8332 if ((ims & PCRE_DOTALL) == 0 && c == NEWLINE) RRETURN(MATCH_NOMATCH);
8339 if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
8343 if ((md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
8346 case OP_NOT_WHITESPACE:
8347 if ((md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
8351 if ((md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
8354 case OP_NOT_WORDCHAR:
8355 if ((md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
8359 if ((md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
8363 RRETURN(PCRE_ERROR_INTERNAL);
8367 /* Control never gets here */
8370 /* If maximizing it is worth using inline code for speed, doing the type
8371 test once at the start (i.e. keep it out of the loop). Again, keep the
8372 UTF-8 and UCP stuff separate. */
8376 pp = eptr; /* Remember where we started */
8381 for (i = min; i < max; i++)
8384 if (eptr >= md->end_subject) break;
8385 GETCHARLEN(c, eptr, len);
8386 prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
8387 if ((*prop_test_variable == prop_test_against) == prop_fail_result)
8392 /* eptr is now past the end of the maximum run */
8396 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
8397 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
8398 if (eptr-- == pp) break; /* Stop if tried at original pos */
8403 /* Match extended Unicode sequences. We will get here only if the
8404 support is in the binary; otherwise a compile-time error occurs. */
8406 else if (ctype == OP_EXTUNI)
8408 for (i = min; i < max; i++)
8410 if (eptr >= md->end_subject) break;
8411 GETCHARINCTEST(c, eptr);
8412 prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
8413 if (prop_category == ucp_M) break;
8414 while (eptr < md->end_subject)
8417 if (!md->utf8) c = *eptr; else
8419 GETCHARLEN(c, eptr, len);
8421 prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
8422 if (prop_category != ucp_M) break;
8427 /* eptr is now past the end of the maximum run */
8431 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
8432 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
8433 if (eptr-- == pp) break; /* Stop if tried at original pos */
8434 for (;;) /* Move back over one extended */
8438 if (!md->utf8) c = *eptr; else
8440 GETCHARLEN(c, eptr, len);
8442 prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
8443 if (prop_category != ucp_M) break;
8450 #endif /* SUPPORT_UCP */
8461 /* Special code is required for UTF8, but when the maximum is unlimited
8462 we don't need it, so we repeat the non-UTF8 code. This is probably
8463 worth it, because .* is quite a common idiom. */
8467 if ((ims & PCRE_DOTALL) == 0)
8469 for (i = min; i < max; i++)
8471 if (eptr >= md->end_subject || *eptr == NEWLINE) break;
8473 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
8478 for (i = min; i < max; i++)
8481 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
8486 /* Handle unlimited UTF-8 repeat */
8490 if ((ims & PCRE_DOTALL) == 0)
8492 for (i = min; i < max; i++)
8494 if (eptr >= md->end_subject || *eptr == NEWLINE) break;
8502 if (c > md->end_subject - eptr) c = md->end_subject - eptr;
8508 /* The byte case is the same as non-UTF8 */
8512 if (c > md->end_subject - eptr) c = md->end_subject - eptr;
8517 for (i = min; i < max; i++)
8520 if (eptr >= md->end_subject) break;
8521 GETCHARLEN(c, eptr, len);
8522 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
8528 for (i = min; i < max; i++)
8531 if (eptr >= md->end_subject) break;
8532 GETCHARLEN(c, eptr, len);
8533 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
8538 case OP_NOT_WHITESPACE:
8539 for (i = min; i < max; i++)
8542 if (eptr >= md->end_subject) break;
8543 GETCHARLEN(c, eptr, len);
8544 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
8550 for (i = min; i < max; i++)
8553 if (eptr >= md->end_subject) break;
8554 GETCHARLEN(c, eptr, len);
8555 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
8560 case OP_NOT_WORDCHAR:
8561 for (i = min; i < max; i++)
8564 if (eptr >= md->end_subject) break;
8565 GETCHARLEN(c, eptr, len);
8566 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
8572 for (i = min; i < max; i++)
8575 if (eptr >= md->end_subject) break;
8576 GETCHARLEN(c, eptr, len);
8577 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
8583 RRETURN(PCRE_ERROR_INTERNAL);
8586 /* eptr is now past the end of the maximum run */
8590 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
8591 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
8592 if (eptr-- == pp) break; /* Stop if tried at original pos */
8599 /* Not UTF-8 mode */
8604 if ((ims & PCRE_DOTALL) == 0)
8606 for (i = min; i < max; i++)
8608 if (eptr >= md->end_subject || *eptr == NEWLINE) break;
8613 /* For DOTALL case, fall through and treat as \C */
8617 if (c > md->end_subject - eptr) c = md->end_subject - eptr;
8622 for (i = min; i < max; i++)
8624 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0)
8631 for (i = min; i < max; i++)
8633 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) == 0)
8639 case OP_NOT_WHITESPACE:
8640 for (i = min; i < max; i++)
8642 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) != 0)
8649 for (i = min; i < max; i++)
8651 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) == 0)
8657 case OP_NOT_WORDCHAR:
8658 for (i = min; i < max; i++)
8660 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) != 0)
8667 for (i = min; i < max; i++)
8669 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) == 0)
8676 RRETURN(PCRE_ERROR_INTERNAL);
8679 /* eptr is now past the end of the maximum run */
8683 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
8685 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
8689 /* Get here if we can't make it match with any permitted repetitions */
8691 RRETURN(MATCH_NOMATCH);
8693 /* Control never gets here */
8695 /* There's been some horrible disaster. Since all codes > OP_BRA are
8696 for capturing brackets, and there shouldn't be any gaps between 0 and
8697 OP_BRA, arrival here can only mean there is something seriously wrong
8698 in the code above or the OP_xxx definitions. */
8701 DPRINTF(("Unknown opcode %d\n", *ecode));
8702 RRETURN(PCRE_ERROR_UNKNOWN_NODE);
8705 /* Do not stick any code in here without much thought; it is assumed
8706 that "continue" in the code above comes out to here to repeat the main
8709 } /* End of main loop */
8710 /* Control never reaches here */
8714 /***************************************************************************
8715 ****************************************************************************
8716 RECURSION IN THE match() FUNCTION
8718 Undefine all the macros that were defined above to handle this. */
/* Each #undef in this section removes one of those macro names, so that it
is no longer defined for the code that follows. */
8736 #undef new_recursive
8752 #undef save_capture_last
8762 /* These two are defined as macros in both cases */
8767 /***************************************************************************
8768 ***************************************************************************/
8772 /*************************************************
8773 * Execute a Regular Expression *
8774 *************************************************/
8776 /* This function applies a compiled re to a subject string and picks out
8777 portions of the string if it matches. Two elements in the vector are set for
8778 each substring: the offsets to the start and end of the substring.
The pair offsets[0], offsets[1] describes the whole match; both values are
byte offsets measured from the start of the subject.
8781 argument_re points to the compiled expression
8782 extra_data points to extra data or is NULL
8783 subject points to the subject string
8784 length length of subject string (may contain binary zeros)
8785 start_offset where to start in the subject string
options option bits for this call; any bit outside PUBLIC_EXEC_OPTIONS
is rejected with PCRE_ERROR_BADOPTION
8787 offsets points to a vector of ints to be filled in with offsets
8788 offsetcount the number of elements in the vector
8790 Returns: > 0 => success; value is the number of elements filled in
8791 = 0 => success, but offsets is not big enough
8792 -1 => failed to match
8793 < -1 => some kind of unexpected problem
Error codes returned by the code below: PCRE_ERROR_BADOPTION,
PCRE_ERROR_NULL, PCRE_ERROR_BADCOUNT, PCRE_ERROR_BADMAGIC,
PCRE_ERROR_BADPARTIAL, PCRE_ERROR_BADUTF8, PCRE_ERROR_BADUTF8_OFFSET,
PCRE_ERROR_NOMEMORY, PCRE_ERROR_PARTIAL and PCRE_ERROR_NOMATCH; any other
non-match result from match() is passed back unchanged.
8797 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
8798 const char *subject, int length, int start_offset, int options, int *offsets,
8801 int rc, resetcount, ocount;
8802 int first_byte = -1;
8805 unsigned long int ims = 0;
8806 BOOL using_temporary_offsets = FALSE;
8809 BOOL first_byte_caseless = FALSE;
8810 BOOL req_byte_caseless = FALSE;
8811 match_data match_block;
8812 const uschar *tables;
8813 const uschar *start_bits = NULL;
8814 const uschar *start_match = (const uschar *)subject + start_offset;
8815 const uschar *end_subject;
8816 const uschar *req_byte_ptr = start_match - 1;
8818 pcre_study_data internal_study;
8819 const pcre_study_data *study;
8821 real_pcre internal_re;
8822 const real_pcre *external_re = (const real_pcre *)argument_re;
8823 const real_pcre *re = external_re;
8825 /* Plausibility checks */
/* Unknown option bits, a NULL pattern or subject, a NULL offsets vector with
a positive offsetcount, and a negative offsetcount are all rejected here,
before any matching state is set up. */
8827 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
8828 if (re == NULL || subject == NULL ||
8829 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
8830 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
8832 /* Fish out the optional data from the extra_data structure, first setting
8833 the default values. */
8836 match_block.match_limit = MATCH_LIMIT;
8837 match_block.callout_data = NULL;
8839 /* The table pointer is always in native byte order. */
8841 tables = external_re->tables;
/* Each field of extra_data is used only when its corresponding bit is set in
extra_data->flags; otherwise the default assigned above is kept. */
8843 if (extra_data != NULL)
8845 register unsigned int flags = extra_data->flags;
8846 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
8847 study = (const pcre_study_data *)extra_data->study_data;
8848 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
8849 match_block.match_limit = extra_data->match_limit;
8850 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
8851 match_block.callout_data = extra_data->callout_data;
8852 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
8855 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
8856 is a feature that makes it possible to save compiled regex and re-use them
8857 in other programs later. */
8859 if (tables == NULL) tables = pcre_default_tables;
8861 /* Check that the first field in the block is the magic number. If it is not,
8862 test for a regex that was compiled on a host of opposite endianness. If this is
8863 the case, flipped values are put in internal_re and internal_study if there was
8866 if (re->magic_number != MAGIC_NUMBER)
8868 re = try_flipped(re, &internal_re, study, &internal_study);
8869 if (re == NULL) return PCRE_ERROR_BADMAGIC;
8870 if (study != NULL) study = &internal_study;
8873 /* Set up other data */
8875 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
8876 startline = (re->options & PCRE_STARTLINE) != 0;
8878 /* The code starts after the real_pcre block and the capture name table. */
8880 match_block.start_code = (const uschar *)external_re + re->name_table_offset +
8881 re->name_count * re->name_entry_size;
/* Record the subject bounds: end_subject is start_subject plus length, and a
local copy of it is kept for the scanning loops further down. */
8883 match_block.start_subject = (const uschar *)subject;
8884 match_block.start_offset = start_offset;
8885 match_block.end_subject = match_block.start_subject + length;
8886 end_subject = match_block.end_subject;
/* These two settings are read from the compiled pattern (re->options). */
8888 match_block.endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
8889 match_block.utf8 = (re->options & PCRE_UTF8) != 0;
/* These settings are read from the options argument of this call. */
8891 match_block.notbol = (options & PCRE_NOTBOL) != 0;
8892 match_block.noteol = (options & PCRE_NOTEOL) != 0;
8893 match_block.notempty = (options & PCRE_NOTEMPTY) != 0;
8894 match_block.partial = (options & PCRE_PARTIAL) != 0;
8895 match_block.hitend = FALSE;
8897 match_block.recursive = NULL; /* No recursion at top level */
/* Point lcc and ctypes at their sections of the character tables chosen
above. */
8899 match_block.lcc = tables + lcc_offset;
8900 match_block.ctypes = tables + ctypes_offset;
8902 /* Partial matching is supported only for a restricted set of regexes at the
8905 if (match_block.partial && (re->options & PCRE_NOPARTIAL) != 0)
8906 return PCRE_ERROR_BADPARTIAL;
8908 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
8909 back the character offset. */
/* The check is skipped when the caller passes PCRE_NO_UTF8_CHECK. A
non-negative result from valid_utf8() is reported as PCRE_ERROR_BADUTF8. */
8912 if (match_block.utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
8914 if (valid_utf8((uschar *)subject, length) >= 0)
8915 return PCRE_ERROR_BADUTF8;
/* NOTE(review): the byte at start_offset is examined here, presumably to
reject an offset that falls inside a multi-byte character; part of this test
is not visible in this view - confirm against the full source. */
8916 if (start_offset > 0 && start_offset < length)
8918 int tb = ((uschar *)subject)[start_offset];
8922 if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
8928 /* The ims options can vary during the matching as a result of the presence
8929 of (?ims) items in the pattern. They are kept in a local variable so that
8930 restoring at the exit of a group is easy. */
8932 ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
8934 /* If the expression has got more back references than the offsets supplied can
8935 hold, we get a temporary chunk of working store to use during the matching.
8936 Otherwise, we can use the vector supplied, rounding down its size to a multiple
8939 ocount = offsetcount - (offsetcount % 3);
8941 if (re->top_backref > 0 && re->top_backref >= ocount/3)
8943 ocount = re->top_backref * 3 + 3;
8944 match_block.offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
8945 if (match_block.offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
8946 using_temporary_offsets = TRUE;
8947 DPRINTF(("Got memory to hold back references\n"));
8949 else match_block.offset_vector = offsets;
8951 match_block.offset_end = ocount;
8952 match_block.offset_max = (2*ocount)/3;
8953 match_block.offset_overflow = FALSE;
8954 match_block.capture_last = -1;
8956 /* Compute the minimum number of offsets that we need to reset each time. Doing
8957 this makes a huge difference to execution time when there aren't many brackets
8960 resetcount = 2 + re->top_bracket * 2;
8961 if (resetcount > offsetcount) resetcount = ocount;
8963 /* Reset the working variable associated with each extraction. These should
8964 never be used unless previously set, but they get saved and restored, and so we
8965 initialize them to avoid reading uninitialized locations. */
/* The loop walks backwards from the end of the vector and sets the last
resetcount/2 - 1 entries to -1. */
8967 if (match_block.offset_vector != NULL)
8969 register int *iptr = match_block.offset_vector + ocount;
8970 register int *iend = iptr - resetcount/2 + 1;
8971 while (--iptr >= iend) *iptr = -1;
8974 /* Set up the first character to match, if available. The first_byte value is
8975 never set for an anchored regular expression, but the anchoring may be forced
8976 at run time, so we have to test for anchoring. The first char may be unset for
8977 an unanchored pattern, of course. If there's no first char and the pattern was
8978 studied, there may be a bitmap of possible first characters. */
8982 if ((re->options & PCRE_FIRSTSET) != 0)
8984 first_byte = re->first_byte & 255;
/* When the first byte is flagged caseless, it is replaced by its lcc[]
mapping so that it can be compared with lcc[] of each subject byte. */
8985 if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
8986 first_byte = match_block.lcc[first_byte];
/* The study bitmap is used only when the pattern is not flagged
PCRE_STARTLINE and the study data has PCRE_STUDY_MAPPED set. */
8989 if (!startline && study != NULL &&
8990 (study->options & PCRE_STUDY_MAPPED) != 0)
8991 start_bits = study->start_bits;
8994 /* For anchored or unanchored matches, there may be a "last known required
8997 if ((re->options & PCRE_REQCHSET) != 0)
8999 req_byte = re->req_byte & 255;
9000 req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
9001 req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */
9004 /* Loop for handling unanchored repeated matching attempts; for anchored regexs
9005 the loop runs just once. */
9009 /* Reset the maximum number of extractions we might see. */
9011 if (match_block.offset_vector != NULL)
9013 register int *iptr = match_block.offset_vector;
9014 register int *iend = iptr + resetcount;
9015 while (iptr < iend) *iptr++ = -1;
9018 /* Advance to a unique first char if possible */
9020 if (first_byte >= 0)
9022 if (first_byte_caseless)
9023 while (start_match < end_subject &&
9024 match_block.lcc[*start_match] != first_byte)
9027 while (start_match < end_subject && *start_match != first_byte)
9031 /* Or to just after \n for a multiline match if possible */
9035 if (start_match > match_block.start_subject + start_offset)
9037 while (start_match < end_subject && start_match[-1] != NEWLINE)
9042 /* Or to a non-unique first char after study */
9044 else if (start_bits != NULL)
9046 while (start_match < end_subject)
/* start_bits holds one bit per byte value: byte c/8, bit c&7. A clear bit
means this byte cannot start a match, so it is skipped. */
9048 register unsigned int c = *start_match;
9049 if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; else break;
9053 #ifdef DEBUG /* Sigh. Some compilers never learn. */
9054 printf(">>>> Match against: ");
9055 pchars(start_match, end_subject - start_match, TRUE, &match_block);
9059 /* If req_byte is set, we know that that character must appear in the subject
9060 for the match to succeed. If the first character is set, req_byte must be
9061 later in the subject; otherwise the test starts at the match point. This
9062 optimization can save a huge amount of backtracking in patterns with nested
9063 unlimited repeats that aren't going to match. Writing separate code for
9064 cased/caseless versions makes it go faster, as does using an autoincrement
9065 and backing off on a match.
9067 HOWEVER: when the subject string is very, very long, searching to its end can
9068 take a long time, and give bad performance on quite ordinary patterns. This
9069 showed up when somebody was matching /^C/ on a 32-megabyte string... so we
9070 don't do this when the string is sufficiently long.
9072 ALSO: this processing is disabled when partial matching is requested.
9075 if (req_byte >= 0 &&
9076 end_subject - start_match < REQ_BYTE_MAX &&
9077 !match_block.partial)
9079 register const uschar *p = start_match + ((first_byte >= 0)? 1 : 0);
9081 /* We don't need to repeat the search if we haven't yet reached the
9082 place we found it at last time. */
9084 if (p > req_byte_ptr)
/* req_byte2 is the case-flipped form of req_byte, so the caseless search
stops on either value; the caseful search looks for req_byte only. */
9086 if (req_byte_caseless)
9088 while (p < end_subject)
9090 register int pp = *p++;
9091 if (pp == req_byte || pp == req_byte2) { p--; break; }
9096 while (p < end_subject)
9098 if (*p++ == req_byte) { p--; break; }
9102 /* If we can't find the required character, break the matching loop */
9104 if (p >= end_subject) break;
9106 /* If we have found the required character, save the point where we
9107 found it, so that we don't search again next time round the loop if
9108 the start hasn't passed this character yet. */
9114 /* When a match occurs, substrings will be set for all internal extractions;
9115 we just need to set up the whole thing as substring 0 before returning. If
9116 there were too many extractions, set the return code to zero. In the case
9117 where we had to get some local store to hold offsets for backreferences, copy
9118 those back references that we can. In this case there need not be overflow
9119 if certain parts of the pattern were not used. */
/* Record where this attempt starts and zero the call counter before handing
over to match(). */
9121 match_block.start_match = start_match;
9122 match_block.match_call_count = 0;
9124 rc = match(start_match, match_block.start_code, 2, &match_block, ims, NULL,
9127 if (rc == MATCH_NOMATCH)
/* In UTF-8 mode, step over continuation bytes (top two bits 10) so that the
next attempt does not begin in the middle of a character. */
9131 if (match_block.utf8)
9132 while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
/* Any result other than MATCH_NOMATCH or MATCH_MATCH is treated as an error.
NOTE(review): if this path returns while using_temporary_offsets is TRUE, the
vector obtained from pcre_malloc does not appear to be freed first - confirm
against the full source. */
9138 if (rc != MATCH_MATCH)
9140 DPRINTF((">>>> error: returning %d\n", rc));
9144 /* We have a match! Copy the offset information from temporary store if
9147 if (using_temporary_offsets)
9149 if (offsetcount >= 4)
9151 memcpy(offsets + 2, match_block.offset_vector + 2,
9152 (offsetcount - 2) * sizeof(int));
9153 DPRINTF(("Copied offsets from temporary memory\n"));
9155 if (match_block.end_offset_top > offsetcount)
9156 match_block.offset_overflow = TRUE;
9158 DPRINTF(("Freeing temporary memory\n"));
9159 (pcre_free)(match_block.offset_vector);
9162 rc = match_block.offset_overflow? 0 : match_block.end_offset_top/2;
9164 if (offsetcount < 2) rc = 0; else
9166 offsets[0] = start_match - match_block.start_subject;
9167 offsets[1] = match_block.end_match_ptr - match_block.start_subject;
9170 DPRINTF((">>>> returning %d\n", rc));
9174 /* This "while" is the end of the "do" above */
9176 while (!anchored && start_match <= end_subject);
/* No match was found: release the temporary offset vector, if one was
allocated, before reporting the result. */
9178 if (using_temporary_offsets)
9180 DPRINTF(("Freeing temporary memory\n"));
9181 (pcre_free)(match_block.offset_vector);
/* PCRE_ERROR_PARTIAL is returned only when partial matching was requested
and the hitend flag was set during matching. */
9184 if (match_block.partial && match_block.hitend)
9186 DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
9187 return PCRE_ERROR_PARTIAL;
9191 DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
9192 return PCRE_ERROR_NOMATCH;