1 /* $Cambridge: exim/src/src/pcre/pcre_exec.c,v 1.2 2005/08/08 10:22:14 ph10 Exp $ */
3 /*************************************************
4 * Perl-Compatible Regular Expressions *
5 *************************************************/
7 /* PCRE is a library of functions to support regular expressions whose syntax
8 and semantics are as close as possible to those of the Perl 5 language.
10 Written by Philip Hazel
11 Copyright (c) 1997-2005 University of Cambridge
13 -----------------------------------------------------------------------------
14 Redistribution and use in source and binary forms, with or without
15 modification, are permitted provided that the following conditions are met:
17 * Redistributions of source code must retain the above copyright notice,
18 this list of conditions and the following disclaimer.
20 * Redistributions in binary form must reproduce the above copyright
21 notice, this list of conditions and the following disclaimer in the
22 documentation and/or other materials provided with the distribution.
24 * Neither the name of the University of Cambridge nor the names of its
25 contributors may be used to endorse or promote products derived from
26 this software without specific prior written permission.
28 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
29 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
30 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
31 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
32 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
33 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
34 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
35 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
36 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
37 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38 POSSIBILITY OF SUCH DAMAGE.
39 -----------------------------------------------------------------------------
43 /* This module contains pcre_exec(), the externally visible function that does
44 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
45 possible. There are also some static supporting functions. */
48 #include "pcre_internal.h"
51 /* Structure for building a chain of data that actually lives on the
52 stack, for holding the values of the subject pointer at the start of each
53 subpattern, so as to detect when an empty string has been matched by a
54 subpattern - to break infinite loops. When NO_RECURSE is set, these blocks
55 are on the heap, not on the stack. */
57 typedef struct eptrblock {
58 struct eptrblock *epb_prev;
59 const uschar *epb_saved_eptr;
62 /* Flag bits for the match() function */
64 #define match_condassert 0x01 /* Called to check a condition assertion */
65 #define match_isgroup 0x02 /* Set if start of bracketed group */
67 /* Non-error returns from the match() function. Error returns are externally
68 defined PCRE_ERROR_xxx codes, which are all negative. */
71 #define MATCH_NOMATCH 0
73 /* Maximum number of ints of offset to save on the stack for recursive calls.
74 If the offset vector is bigger, malloc is used. This should be a multiple of 3,
75 because the offset vector is always a multiple of 3 long. */
77 #define REC_STACK_SAVE_MAX 30
79 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
/* The two tables are parallel: entry k of rep_min pairs with entry k of
rep_max, giving the ranges 0..inf, 0..inf, 1..inf, 1..inf, 0..1, 0..1.
NOTE(review): this looks like the greedy/minimizing pairs for '*', '+' and
'?'; confirm against the opcode order used where these tables are indexed. */
81 static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
82 static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
87 /*************************************************
88 * Debugging function to print chars *
89 *************************************************/
91 /* Print a sequence of chars in printable format, stopping at the end of the
92 subject if requested.
95 p points to characters
96 length number to print
97 is_subject TRUE if printing from within md->start_subject
98 md pointer to matching data block, if is_subject is TRUE
104 pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
107 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
109 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
115 /*************************************************
116 * Match a back-reference *
117 *************************************************/
119 /* If a back reference hasn't been set, the length that is passed is greater
120 than the number of characters left in the string, so the match fails.
123 offset index into the offset vector
124 eptr points into the subject
125 length length to be matched
126 md points to match data block
129 Returns: TRUE if matched
133 match_ref(int offset, register const uschar *eptr, int length, match_data *md,
134 unsigned long int ims)
136 const uschar *p = md->start_subject + md->offset_vector[offset];
139 if (eptr >= md->end_subject)
140 printf("matching subject <null>");
143 printf("matching subject ");
144 pchars(eptr, length, TRUE, md);
146 printf(" against backref ");
147 pchars(p, length, FALSE, md);
151 /* Always fail if not enough characters left */
153 if (length > md->end_subject - eptr) return FALSE;
155 /* Separate the caseless case for speed */
157 if ((ims & PCRE_CASELESS) != 0)
160 if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE;
163 { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
170 /***************************************************************************
171 ****************************************************************************
172 RECURSION IN THE match() FUNCTION
174 The match() function is highly recursive. Some regular expressions can cause
175 it to recurse thousands of times. I was writing for Unix, so I just let it
176 call itself recursively. This uses the stack for saving everything that has
177 to be saved for a recursive call. On Unix, the stack can be large, and this
180 It turns out that on non-Unix systems there are problems with programs that
181 use a lot of stack. (This despite the fact that every last chip has oodles
182 of memory these days, and techniques for extending the stack have been known
185 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
186 calls by keeping local variables that need to be preserved in blocks of memory
187 obtained from malloc instead of on the stack. Macros are used to
188 achieve this so that the actual code doesn't look very different to what it
190 ****************************************************************************
191 ***************************************************************************/
194 /* These versions of the macros use the stack, as normal */
/* REGISTER expands to the "register" storage-class keyword. */
197 #define REGISTER register
/* RMATCH stores the result of a direct recursive call to match() in rx; the
arguments ra..rg are passed through to match() in order. */
198 #define RMATCH(rx,ra,rb,rc,rd,re,rf,rg) rx = match(ra,rb,rc,rd,re,rf,rg)
/* RRETURN is a plain "return" of ra from the current match() call. */
199 #define RRETURN(ra) return ra
203 /* These versions of the macros manage a private stack on the heap. Note
204 that the rd argument of RMATCH isn't actually used. It's the md argument of
205 match(), which never changes. */
209 #define RMATCH(rx,ra,rb,rc,rd,re,rf,rg)\
211 heapframe *newframe = (pcre_stack_malloc)(sizeof(heapframe));\
212 if (setjmp(frame->Xwhere) == 0)\
214 newframe->Xeptr = ra;\
215 newframe->Xecode = rb;\
216 newframe->Xoffset_top = rc;\
217 newframe->Xims = re;\
218 newframe->Xeptrb = rf;\
219 newframe->Xflags = rg;\
220 newframe->Xprevframe = frame;\
222 DPRINTF(("restarting from line %d\n", __LINE__));\
227 DPRINTF(("longjumped back to line %d\n", __LINE__));\
228 frame = md->thisframe;\
229 rx = frame->Xresult;\
235 heapframe *newframe = frame;\
236 frame = newframe->Xprevframe;\
237 (pcre_stack_free)(newframe);\
240 frame->Xresult = ra;\
241 md->thisframe = frame;\
242 longjmp(frame->Xwhere, 1);\
248 /* Structure for remembering the local variables in a private frame */
250 typedef struct heapframe {
251 struct heapframe *Xprevframe;
253 /* Function arguments that may change */
256 const uschar *Xecode;
262 /* Function local variables */
264 const uschar *Xcallpat;
265 const uschar *Xcharptr;
270 const uschar *Xsaved_eptr;
272 recursion_info Xnew_recursive;
279 unsigned long int Xoriginal_ims;
283 int Xprop_fail_result;
287 int Xprop_test_against;
288 int *Xprop_test_variable;
300 int Xsave_capture_last;
301 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
302 int Xstacksave[REC_STACK_SAVE_MAX];
306 /* Place to pass back result, and where to jump back to */
316 /***************************************************************************
317 ***************************************************************************/
321 /*************************************************
322 * Match from current position *
323 *************************************************/
325 /* On entry ecode points to the first opcode, and eptr to the first character
326 in the subject string, while eptrb holds the value of eptr at the start of the
327 last bracketed group - used for breaking infinite loops matching zero-length
328 strings. This function is called recursively in many circumstances. Whenever it
329 returns a negative (error) response, the outer incarnation must also return the
332 Performance note: It might be tempting to extract commonly used fields from the
333 md structure (e.g. utf8, end_subject) into individual variables to improve
334 performance. Tests using gcc on a SPARC disproved this; in the first case, it
335 made performance worse.
338 eptr pointer in subject
339 ecode position in code
340 offset_top current top pointer
341 md pointer to "static" info for the match
342 ims current /i, /m, and /s options
343 eptrb pointer to chain of blocks containing eptr at start of
344 brackets - for testing for empty matches
346 match_condassert - this is an assertion condition
347 match_isgroup - this is the start of a bracketed group
349 Returns: MATCH_MATCH if matched ) these values are >= 0
350 MATCH_NOMATCH if failed to match )
351 a negative PCRE_ERROR_xxx value if aborted by an error condition
352 (e.g. stopped by recursion limit)
356 match(REGISTER const uschar *eptr, REGISTER const uschar *ecode,
357 int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
360 /* These variables do not need to be preserved over recursion in this function,
361 so they can be ordinary variables in all cases. Mark them with "register"
362 because they are used a lot in loops. */
364 register int rrc; /* Returns from recursive calls */
365 register int i; /* Used for loops not involving calls to RMATCH() */
366 register int c; /* Character values not kept over RMATCH() calls */
367 register BOOL utf8; /* Local copy of UTF-8 flag for speed */
369 /* When recursion is not being used, all "local" variables that have to be
370 preserved over calls to RMATCH() are part of a "frame" which is obtained from
371 heap storage. Set up the top-level frame here; others are obtained from the
372 heap whenever RMATCH() does a "recursion". See the macro definitions above. */
375 heapframe *frame = (pcre_stack_malloc)(sizeof(heapframe));
376 frame->Xprevframe = NULL; /* Marks the top level */
378 /* Copy in the original argument variables */
381 frame->Xecode = ecode;
382 frame->Xoffset_top = offset_top;
384 frame->Xeptrb = eptrb;
385 frame->Xflags = flags;
387 /* This is the point that control jumps back to in order to effect "recursion" */
391 /* Macros make the argument variables come from the current frame */
393 #define eptr frame->Xeptr
394 #define ecode frame->Xecode
395 #define offset_top frame->Xoffset_top
396 #define ims frame->Xims
397 #define eptrb frame->Xeptrb
398 #define flags frame->Xflags
400 /* Ditto for the local variables */
403 #define charptr frame->Xcharptr
405 #define callpat frame->Xcallpat
406 #define data frame->Xdata
407 #define next frame->Xnext
408 #define pp frame->Xpp
409 #define prev frame->Xprev
410 #define saved_eptr frame->Xsaved_eptr
412 #define new_recursive frame->Xnew_recursive
414 #define cur_is_word frame->Xcur_is_word
415 #define condition frame->Xcondition
416 #define minimize frame->Xminimize
417 #define prev_is_word frame->Xprev_is_word
419 #define original_ims frame->Xoriginal_ims
422 #define prop_type frame->Xprop_type
423 #define prop_fail_result frame->Xprop_fail_result
424 #define prop_category frame->Xprop_category
425 #define prop_chartype frame->Xprop_chartype
426 #define prop_othercase frame->Xprop_othercase
427 #define prop_test_against frame->Xprop_test_against
428 #define prop_test_variable frame->Xprop_test_variable
431 #define ctype frame->Xctype
432 #define fc frame->Xfc
433 #define fi frame->Xfi
434 #define length frame->Xlength
435 #define max frame->Xmax
436 #define min frame->Xmin
437 #define number frame->Xnumber
438 #define offset frame->Xoffset
439 #define op frame->Xop
440 #define save_capture_last frame->Xsave_capture_last
441 #define save_offset1 frame->Xsave_offset1
442 #define save_offset2 frame->Xsave_offset2
443 #define save_offset3 frame->Xsave_offset3
444 #define stacksave frame->Xstacksave
446 #define newptrb frame->Xnewptrb
448 /* When recursion is being used, local variables are allocated on the stack and
449 get preserved during recursion in the normal way. In this environment, fi and
450 i, and fc and c, can be the same variables. */
457 #ifdef SUPPORT_UTF8 /* Many of these variables are used only in */
458 const uschar *charptr; /* small blocks of the code. My normal */
459 #endif /* style of coding would have declared */
460 const uschar *callpat; /* them within each of those blocks. */
461 const uschar *data; /* However, in order to accommodate the */
462 const uschar *next; /* version of this code that uses an */
463 const uschar *pp; /* external "stack" implemented on the */
464 const uschar *prev; /* heap, it is easier to declare them */
465 const uschar *saved_eptr; /* all here, so the declarations can */
466 /* be cut out in a block. The only */
467 recursion_info new_recursive; /* declarations within blocks below are */
468 /* for variables that do not have to */
469 BOOL cur_is_word; /* be preserved over a recursive call */
470 BOOL condition; /* to RMATCH(). */
474 unsigned long int original_ims;
478 int prop_fail_result;
482 int prop_test_against;
483 int *prop_test_variable;
493 int save_capture_last;
494 int save_offset1, save_offset2, save_offset3;
495 int stacksave[REC_STACK_SAVE_MAX];
500 /* These statements are here to stop the compiler complaining about uninitialized
504 prop_fail_result = 0;
505 prop_test_against = 0;
506 prop_test_variable = NULL;
509 /* OK, now we can get on with the real code of the function. Recursion is
510 specified by the macros RMATCH and RRETURN. When NO_RECURSE is *not* defined,
511 these just turn into a recursive call to match() and a "return", respectively.
512 However, RMATCH isn't like a function call because it's quite a complicated
513 macro. It has to be used in one particular way. This shouldn't, however, impact
514 performance when true recursion is being used. */
516 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
518 original_ims = ims; /* Save for resetting on ')' */
519 utf8 = md->utf8; /* Local copy of the flag */
521 /* At the start of a bracketed group, add the current subject pointer to the
522 stack of such pointers, to be re-instated at the end of the group when we hit
523 the closing ket. When match() is called in other circumstances, we don't add to
526 if ((flags & match_isgroup) != 0)
528 newptrb.epb_prev = eptrb;
529 newptrb.epb_saved_eptr = eptr;
533 /* Now start processing the operations. */
540 /* For partial matching, remember if we ever hit the end of the subject after
541 matching at least one subject character. */
544 eptr >= md->end_subject &&
545 eptr > md->start_match)
548 /* Opening capturing bracket. If there is space in the offset vector, save
549 the current subject position in the working slot at the top of the vector. We
550 mustn't change the current values of the data slot, because they may be set
551 from a previous iteration of this group, and be referred to by a reference
554 If the bracket fails to match, we need to restore this value and also the
555 values of the final offsets, in case they were set by a previous iteration of
558 If there isn't enough space in the offset vector, treat this as if it were a
559 non-capturing bracket. Don't worry about setting the flag for the error case
560 here; that is handled in the code for KET. */
564 number = op - OP_BRA;
566 /* For extended extraction brackets (large number), we have to fish out the
567 number from a dummy opcode at the start. */
569 if (number > EXTRACT_BASIC_MAX)
570 number = GET2(ecode, 2+LINK_SIZE);
571 offset = number << 1;
574 printf("start bracket %d subject=", number);
575 pchars(eptr, 16, TRUE, md);
579 if (offset < md->offset_max)
581 save_offset1 = md->offset_vector[offset];
582 save_offset2 = md->offset_vector[offset+1];
583 save_offset3 = md->offset_vector[md->offset_end - number];
584 save_capture_last = md->capture_last;
586 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
587 md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
591 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb,
593 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
594 md->capture_last = save_capture_last;
595 ecode += GET(ecode, 1);
597 while (*ecode == OP_ALT);
599 DPRINTF(("bracket %d failed\n", number));
601 md->offset_vector[offset] = save_offset1;
602 md->offset_vector[offset+1] = save_offset2;
603 md->offset_vector[md->offset_end - number] = save_offset3;
605 RRETURN(MATCH_NOMATCH);
608 /* Insufficient room for saving captured contents */
613 /* Other types of node can be handled by a switch */
617 case OP_BRA: /* Non-capturing bracket: optimized */
618 DPRINTF(("start bracket 0\n"));
621 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb,
623 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
624 ecode += GET(ecode, 1);
626 while (*ecode == OP_ALT);
627 DPRINTF(("bracket 0 failed\n"));
628 RRETURN(MATCH_NOMATCH);
630 /* Conditional group: compilation checked that there are no more than
631 two branches. If the condition is false, skipping the first branch takes us
632 past the end if there is only one branch, but that's OK because that is
633 exactly what going to the ket would do. */
636 if (ecode[LINK_SIZE+1] == OP_CREF) /* Condition extract or recurse test */
638 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
639 condition = (offset == CREF_RECURSE * 2)?
640 (md->recursive != NULL) :
641 (offset < offset_top && md->offset_vector[offset] >= 0);
642 RMATCH(rrc, eptr, ecode + (condition?
643 (LINK_SIZE + 4) : (LINK_SIZE + 1 + GET(ecode, 1))),
644 offset_top, md, ims, eptrb, match_isgroup);
648 /* The condition is an assertion. Call match() to evaluate it - setting
649 the match_condassert flag causes it to stop at the end of an assertion. */
653 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
654 match_condassert | match_isgroup);
655 if (rrc == MATCH_MATCH)
657 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE+2);
658 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
660 else if (rrc != MATCH_NOMATCH)
662 RRETURN(rrc); /* Need braces because of following else */
664 else ecode += GET(ecode, 1);
665 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb,
669 /* Control never reaches here */
671 /* Skip over conditional reference or large extraction number data if
679 /* End of the pattern. If we are in a recursion, we should restore the
680 offsets appropriately and continue from after the call. */
683 if (md->recursive != NULL && md->recursive->group_num == 0)
685 recursion_info *rec = md->recursive;
686 DPRINTF(("Hit the end in a (?0) recursion\n"));
687 md->recursive = rec->prevrec;
688 memmove(md->offset_vector, rec->offset_save,
689 rec->saved_max * sizeof(int));
690 md->start_match = rec->save_start;
692 ecode = rec->after_call;
696 /* Otherwise, if PCRE_NOTEMPTY is set, fail if we have matched an empty
697 string - backtracking will then try other alternatives, if any. */
699 if (md->notempty && eptr == md->start_match) RRETURN(MATCH_NOMATCH);
700 md->end_match_ptr = eptr; /* Record where we ended */
701 md->end_offset_top = offset_top; /* and how many extracts were taken */
702 RRETURN(MATCH_MATCH);
704 /* Change option settings */
709 DPRINTF(("ims set to %02lx\n", ims));
712 /* Assertion brackets. Check the alternative branches in turn - the
713 matching won't pass the KET for an assertion. If any one branch matches,
714 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
715 start of each branch to move the current point backwards, so the code at
716 this level is identical to the lookahead case. */
722 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
724 if (rrc == MATCH_MATCH) break;
725 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
726 ecode += GET(ecode, 1);
728 while (*ecode == OP_ALT);
729 if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
731 /* If checking an assertion for a condition, return MATCH_MATCH. */
733 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
735 /* Continue from after the assertion, updating the offsets high water
736 mark, since extracts may have been taken during the assertion. */
738 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
739 ecode += 1 + LINK_SIZE;
740 offset_top = md->end_offset_top;
743 /* Negative assertion: all branches must fail to match */
746 case OP_ASSERTBACK_NOT:
749 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
751 if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH);
752 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
753 ecode += GET(ecode,1);
755 while (*ecode == OP_ALT);
757 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
759 ecode += 1 + LINK_SIZE;
762 /* Move the subject pointer back. This occurs only at the start of
763 each branch of a lookbehind assertion. If we are too close to the start to
764 move back, this match function fails. When working with UTF-8 we move
765 back a number of characters, not bytes. */
772 for (i = 0; i < c; i++)
775 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
782 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
785 eptr -= GET(ecode,1);
786 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
789 /* Skip to next op code */
791 ecode += 1 + LINK_SIZE;
794 /* The callout item calls an external function, if one is provided, passing
795 details of the match so far. This is mainly for debugging, though the
796 function is able to force a failure. */
799 if (pcre_callout != NULL)
801 pcre_callout_block cb;
802 cb.version = 1; /* Version 1 of the callout block */
803 cb.callout_number = ecode[1];
804 cb.offset_vector = md->offset_vector;
805 cb.subject = (const char *)md->start_subject;
806 cb.subject_length = md->end_subject - md->start_subject;
807 cb.start_match = md->start_match - md->start_subject;
808 cb.current_position = eptr - md->start_subject;
809 cb.pattern_position = GET(ecode, 2);
810 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
811 cb.capture_top = offset_top/2;
812 cb.capture_last = md->capture_last;
813 cb.callout_data = md->callout_data;
814 if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
815 if (rrc < 0) RRETURN(rrc);
817 ecode += 2 + 2*LINK_SIZE;
820 /* Recursion either matches the current regex, or some subexpression. The
821 offset data is the offset to the starting bracket from the start of the
822 whole pattern. (This is so that it works from duplicated subpatterns.)
824 If there are any capturing brackets started but not finished, we have to
825 save their starting points and reinstate them after the recursion. However,
826 we don't know how many such there are (offset_top records the completed
827 total) so we just have to save all the potential data. There may be up to
828 65535 such values, which is too large to put on the stack, but using malloc
829 for small numbers seems expensive. As a compromise, the stack is used when
830 there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
831 is used. If the malloc fails, the match is abandoned and
832 PCRE_ERROR_NOMEMORY is returned, which propagates to the top level like any
833 other negative (error) return from match().
835 There are also other values that have to be saved. We use a chained
836 sequence of blocks that actually live on the stack. Thanks to Robin Houston
837 for the original version of this logic. */
841 callpat = md->start_code + GET(ecode, 1);
842 new_recursive.group_num = *callpat - OP_BRA;
844 /* For extended extraction brackets (large number), we have to fish out
845 the number from a dummy opcode at the start. */
847 if (new_recursive.group_num > EXTRACT_BASIC_MAX)
848 new_recursive.group_num = GET2(callpat, 2+LINK_SIZE);
850 /* Add to "recursing stack" */
852 new_recursive.prevrec = md->recursive;
853 md->recursive = &new_recursive;
855 /* Find where to continue from afterwards */
857 ecode += 1 + LINK_SIZE;
858 new_recursive.after_call = ecode;
860 /* Now save the offset data. */
862 new_recursive.saved_max = md->offset_end;
863 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
864 new_recursive.offset_save = stacksave;
867 new_recursive.offset_save =
868 (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
869 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
872 memcpy(new_recursive.offset_save, md->offset_vector,
873 new_recursive.saved_max * sizeof(int));
874 new_recursive.save_start = md->start_match;
875 md->start_match = eptr;
877 /* OK, now we can do the recursion. For each top-level alternative we
878 restore the offset and recursion data. */
880 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
883 RMATCH(rrc, eptr, callpat + 1 + LINK_SIZE, offset_top, md, ims,
884 eptrb, match_isgroup);
885 if (rrc == MATCH_MATCH)
887 md->recursive = new_recursive.prevrec;
888 if (new_recursive.offset_save != stacksave)
889 (pcre_free)(new_recursive.offset_save);
890 RRETURN(MATCH_MATCH);
892 else if (rrc != MATCH_NOMATCH) RRETURN(rrc);
894 md->recursive = &new_recursive;
895 memcpy(md->offset_vector, new_recursive.offset_save,
896 new_recursive.saved_max * sizeof(int));
897 callpat += GET(callpat, 1);
899 while (*callpat == OP_ALT);
901 DPRINTF(("Recursion didn't match\n"));
902 md->recursive = new_recursive.prevrec;
903 if (new_recursive.offset_save != stacksave)
904 (pcre_free)(new_recursive.offset_save);
905 RRETURN(MATCH_NOMATCH);
907 /* Control never reaches here */
909 /* "Once" brackets are like assertion brackets except that after a match,
910 the point in the subject string is not moved back. Thus there can never be
911 a move back into the brackets. Friedl calls these "atomic" subpatterns.
912 Check the alternative branches in turn - the matching won't pass the KET
913 for this kind of subpattern. If any one branch matches, we carry on as at
914 the end of a normal bracket, leaving the subject pointer. */
923 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims,
924 eptrb, match_isgroup);
925 if (rrc == MATCH_MATCH) break;
926 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
927 ecode += GET(ecode,1);
929 while (*ecode == OP_ALT);
931 /* If hit the end of the group (which could be repeated), fail */
933 if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
935 /* Continue as from after the assertion, updating the offsets high water
936 mark, since extracts may have been taken. */
938 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
940 offset_top = md->end_offset_top;
941 eptr = md->end_match_ptr;
943 /* For a non-repeating ket, just continue at this level. This also
944 happens for a repeating ket if no characters were matched in the group.
945 This is the forcible breaking of infinite loops as implemented in Perl
946 5.005. If there is an options reset, it will get obeyed in the normal
949 if (*ecode == OP_KET || eptr == saved_eptr)
951 ecode += 1+LINK_SIZE;
955 /* The repeating kets try the rest of the pattern or restart from the
956 preceding bracket, in the appropriate order. We need to reset any options
957 that changed within the bracket before re-running it, so check the next
960 if (ecode[1+LINK_SIZE] == OP_OPT)
962 ims = (ims & ~PCRE_IMS) | ecode[4];
963 DPRINTF(("ims set to %02lx at group repeat\n", ims));
966 if (*ecode == OP_KETRMIN)
968 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0);
969 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
970 RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);
971 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
973 else /* OP_KETRMAX */
975 RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);
976 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
977 RMATCH(rrc, eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0);
978 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
981 RRETURN(MATCH_NOMATCH);
983 /* An alternation is the end of a branch; scan along to find the end of the
984 bracketed group and go to there. */
987 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
990 /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
991 that it may occur zero times. It may repeat infinitely, or not at all -
992 i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
993 repeat limits are compiled as a number of copies, with the optional ones
994 preceded by BRAZERO or BRAMINZERO. */
999 RMATCH(rrc, eptr, next, offset_top, md, ims, eptrb, match_isgroup);
1000 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1001 do next += GET(next,1); while (*next == OP_ALT);
1002 ecode = next + 1+LINK_SIZE;
1009 do next += GET(next,1); while (*next == OP_ALT);
1010 RMATCH(rrc, eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb,
1012 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1017 /* End of a group, repeated or non-repeating. If we are at the end of
1018 an assertion "group", stop matching and return MATCH_MATCH, but record the
1019 current high water mark for use by positive assertions. Do this also
1020 for the "once" (no backing up) groups. */
1026 prev = ecode - GET(ecode, 1);
1027 saved_eptr = eptrb->epb_saved_eptr;
1029 /* Back up the stack of bracket start pointers. */
1031 eptrb = eptrb->epb_prev;
1033 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
1034 *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
1037 md->end_match_ptr = eptr; /* For ONCE */
1038 md->end_offset_top = offset_top;
1039 RRETURN(MATCH_MATCH);
1042 /* In all other cases except a conditional group we have to check the
1043 group number back at the start and if necessary complete handling an
1044 extraction by setting the offsets and bumping the high water mark. */
1046 if (*prev != OP_COND)
1048 number = *prev - OP_BRA;
1050 /* For extended extraction brackets (large number), we have to fish out
1051 the number from a dummy opcode at the start. */
1053 if (number > EXTRACT_BASIC_MAX) number = GET2(prev, 2+LINK_SIZE);
1054 offset = number << 1;
1057 printf("end bracket %d", number);
1061 /* Test for a numbered group. This includes groups called as a result
1062 of recursion. Note that whole-pattern recursion is coded as a recurse
1063 into group 0, so it won't be picked up here. Instead, we catch it when
1064 the OP_END is reached. */
1068 md->capture_last = number;
1069 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1071 md->offset_vector[offset] =
1072 md->offset_vector[md->offset_end - number];
1073 md->offset_vector[offset+1] = eptr - md->start_subject;
1074 if (offset_top <= offset) offset_top = offset + 2;
1077 /* Handle a recursively called group. Restore the offsets
1078 appropriately and continue from after the call. */
1080 if (md->recursive != NULL && md->recursive->group_num == number)
1082 recursion_info *rec = md->recursive;
1083 DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
1084 md->recursive = rec->prevrec;
1085 md->start_match = rec->save_start;
1086 memcpy(md->offset_vector, rec->offset_save,
1087 rec->saved_max * sizeof(int));
1088 ecode = rec->after_call;
1095 /* Reset the value of the ims flags, in case they got changed during
1099 DPRINTF(("ims reset to %02lx\n", ims));
1101 /* For a non-repeating ket, just continue at this level. This also
1102 happens for a repeating ket if no characters were matched in the group.
1103 This is the forcible breaking of infinite loops as implemented in Perl
1104 5.005. If there is an options reset, it will get obeyed in the normal
1105 course of events. */
1107 if (*ecode == OP_KET || eptr == saved_eptr)
1109 ecode += 1 + LINK_SIZE;
1113 /* The repeating kets try the rest of the pattern or restart from the
1114 preceding bracket, in the appropriate order. */
1116 if (*ecode == OP_KETRMIN)
1118 RMATCH(rrc, eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0);
1119 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1120 RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);
1121 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1123 else /* OP_KETRMAX */
1125 RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);
1126 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1127 RMATCH(rrc, eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0);
1128 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1132 RRETURN(MATCH_NOMATCH);
1134 /* Start of subject unless notbol, or after internal newline if multiline */
1137 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
1138 if ((ims & PCRE_MULTILINE) != 0)
1140 if (eptr != md->start_subject && eptr[-1] != NEWLINE)
1141 RRETURN(MATCH_NOMATCH);
1145 /* ... else fall through */
1147 /* Start of subject assertion */
1150 if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
1154 /* Start of match assertion */
1157 if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
1161 /* Assert before internal newline if multiline, or before a terminating
1162 newline unless endonly is set, else end of subject unless noteol is set. */
1165 if ((ims & PCRE_MULTILINE) != 0)
1167 if (eptr < md->end_subject)
1168 { if (*eptr != NEWLINE) RRETURN(MATCH_NOMATCH); }
1170 { if (md->noteol) RRETURN(MATCH_NOMATCH); }
1176 if (md->noteol) RRETURN(MATCH_NOMATCH);
1179 if (eptr < md->end_subject - 1 ||
1180 (eptr == md->end_subject - 1 && *eptr != NEWLINE))
1181 RRETURN(MATCH_NOMATCH);
1186 /* ... else fall through */
1188 /* End of subject assertion (\z) */
1191 if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
1195 /* End of subject or ending \n assertion (\Z) */
1198 if (eptr < md->end_subject - 1 ||
1199 (eptr == md->end_subject - 1 && *eptr != NEWLINE)) RRETURN(MATCH_NOMATCH);
1203 /* Word boundary assertions */
1205 case OP_NOT_WORD_BOUNDARY:
1206 case OP_WORD_BOUNDARY:
1209 /* Find out if the previous and current characters are "word" characters.
1210 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
1211 be "non-word" characters. */
1216 if (eptr == md->start_subject) prev_is_word = FALSE; else
1218 const uschar *lastptr = eptr - 1;
1219 while((*lastptr & 0xc0) == 0x80) lastptr--;
1220 GETCHAR(c, lastptr);
1221 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1223 if (eptr >= md->end_subject) cur_is_word = FALSE; else
1226 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1232 /* More streamlined when not in UTF-8 mode */
1235 prev_is_word = (eptr != md->start_subject) &&
1236 ((md->ctypes[eptr[-1]] & ctype_word) != 0);
1237 cur_is_word = (eptr < md->end_subject) &&
1238 ((md->ctypes[*eptr] & ctype_word) != 0);
1241 /* Now see if the situation is what we want */
1243 if ((*ecode++ == OP_WORD_BOUNDARY)?
1244 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
1245 RRETURN(MATCH_NOMATCH);
1249 /* Match a single character type; inline for speed */
1252 if ((ims & PCRE_DOTALL) == 0 && eptr < md->end_subject && *eptr == NEWLINE)
1253 RRETURN(MATCH_NOMATCH);
1254 if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1257 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
1262 /* Match a single byte, even in UTF-8 mode. This opcode really does match
1263 any byte, even newline, independent of the setting of PCRE_DOTALL. */
1266 if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1271 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1272 GETCHARINCTEST(c, eptr);
1277 (md->ctypes[c] & ctype_digit) != 0
1279 RRETURN(MATCH_NOMATCH);
1284 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1285 GETCHARINCTEST(c, eptr);
1290 (md->ctypes[c] & ctype_digit) == 0
1292 RRETURN(MATCH_NOMATCH);
1296 case OP_NOT_WHITESPACE:
1297 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1298 GETCHARINCTEST(c, eptr);
1303 (md->ctypes[c] & ctype_space) != 0
1305 RRETURN(MATCH_NOMATCH);
1310 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1311 GETCHARINCTEST(c, eptr);
1316 (md->ctypes[c] & ctype_space) == 0
1318 RRETURN(MATCH_NOMATCH);
1322 case OP_NOT_WORDCHAR:
1323 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1324 GETCHARINCTEST(c, eptr);
1329 (md->ctypes[c] & ctype_word) != 0
1331 RRETURN(MATCH_NOMATCH);
1336 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1337 GETCHARINCTEST(c, eptr);
1342 (md->ctypes[c] & ctype_word) == 0
1344 RRETURN(MATCH_NOMATCH);
1349 /* Check the next character by Unicode property. We will get here only
1350 if the support is in the binary; otherwise a compile-time error occurs. */
1354 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1355 GETCHARINCTEST(c, eptr);
1357 int chartype, rqdtype;
1359 int category = ucp_findchar(c, &chartype, &othercase);
1361 rqdtype = *(++ecode);
1366 if ((rqdtype - 128 != category) == (op == OP_PROP))
1367 RRETURN(MATCH_NOMATCH);
1371 if ((rqdtype != chartype) == (op == OP_PROP))
1372 RRETURN(MATCH_NOMATCH);
1377 /* Match an extended Unicode sequence. We will get here only if the support
1378 is in the binary; otherwise a compile-time error occurs. */
1381 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1382 GETCHARINCTEST(c, eptr);
1386 int category = ucp_findchar(c, &chartype, &othercase);
1387 if (category == ucp_M) RRETURN(MATCH_NOMATCH);
1388 while (eptr < md->end_subject)
1391 if (!utf8) c = *eptr; else
1393 GETCHARLEN(c, eptr, len);
1395 category = ucp_findchar(c, &chartype, &othercase);
1396 if (category != ucp_M) break;
1405 /* Match a back reference, possibly repeatedly. Look past the end of the
1406 item to see if there is repeat information following. The code is similar
1407 to that for character classes, but repeated for efficiency. Then obey
1408 similar code to character type repeats - written out again for speed.
1409 However, if the referenced string is the empty string, always treat
1410 it as matched, any number of times (otherwise there could be infinite
1415 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
1416 ecode += 3; /* Advance past item */
1418 /* If the reference is unset, set the length to be longer than the amount
1419 of subject left; this ensures that every attempt at a match fails. We
1420 can't just fail here, because of the possibility of quantifiers with zero
1423 length = (offset >= offset_top || md->offset_vector[offset] < 0)?
1424 md->end_subject - eptr + 1 :
1425 md->offset_vector[offset+1] - md->offset_vector[offset];
1427 /* Set up for repetition, or handle the non-repeated case */
1437 c = *ecode++ - OP_CRSTAR;
1438 minimize = (c & 1) != 0;
1439 min = rep_min[c]; /* Pick up values from tables; */
1440 max = rep_max[c]; /* zero for max => infinity */
1441 if (max == 0) max = INT_MAX;
1446 minimize = (*ecode == OP_CRMINRANGE);
1447 min = GET2(ecode, 1);
1448 max = GET2(ecode, 3);
1449 if (max == 0) max = INT_MAX;
1453 default: /* No repeat follows */
1454 if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
1456 continue; /* With the main loop */
1459 /* If the length of the reference is zero, just continue with the
1462 if (length == 0) continue;
1464 /* First, ensure the minimum number of matches are present. We get back
1465 the length of the reference string explicitly rather than passing the
1466 address of eptr, so that eptr can be a register variable. */
1468 for (i = 1; i <= min; i++)
1470 if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
1474 /* If min = max, continue at the same level without recursion.
1475 They are not both allowed to be zero. */
1477 if (min == max) continue;
1479 /* If minimizing, keep trying and advancing the pointer */
1483 for (fi = min;; fi++)
1485 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1486 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1487 if (fi >= max || !match_ref(offset, eptr, length, md, ims))
1488 RRETURN(MATCH_NOMATCH);
1491 /* Control never gets here */
1494 /* If maximizing, find the longest string and work backwards */
1499 for (i = min; i < max; i++)
1501 if (!match_ref(offset, eptr, length, md, ims)) break;
1506 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1507 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1510 RRETURN(MATCH_NOMATCH);
1513 /* Control never gets here */
1517 /* Match a bit-mapped character class, possibly repeatedly. This op code is
1518 used when all the characters in the class have values in the range 0-255,
1519 and either the matching is caseful, or the characters are in the range
1520 0-127 when UTF-8 processing is enabled. The only difference between
1521 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
1524 First, look past the end of the item to see if there is repeat information
1525 following. Then obey similar code to character type repeats - written out
1531 data = ecode + 1; /* Save for matching */
1532 ecode += 33; /* Advance past the item */
1542 c = *ecode++ - OP_CRSTAR;
1543 minimize = (c & 1) != 0;
1544 min = rep_min[c]; /* Pick up values from tables; */
1545 max = rep_max[c]; /* zero for max => infinity */
1546 if (max == 0) max = INT_MAX;
1551 minimize = (*ecode == OP_CRMINRANGE);
1552 min = GET2(ecode, 1);
1553 max = GET2(ecode, 3);
1554 if (max == 0) max = INT_MAX;
1558 default: /* No repeat follows */
1563 /* First, ensure the minimum number of matches are present. */
1569 for (i = 1; i <= min; i++)
1571 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1572 GETCHARINC(c, eptr);
1575 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
1579 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1585 /* Not UTF-8 mode */
1587 for (i = 1; i <= min; i++)
1589 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1591 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1595 /* If max == min we can continue with the main loop without the
1598 if (min == max) continue;
1600 /* If minimizing, keep testing the rest of the expression and advancing
1601 the pointer while it matches the class. */
1609 for (fi = min;; fi++)
1611 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1612 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1613 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1614 GETCHARINC(c, eptr);
1617 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
1621 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1627 /* Not UTF-8 mode */
1629 for (fi = min;; fi++)
1631 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1632 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1633 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1635 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1638 /* Control never gets here */
1641 /* If maximizing, find the longest possible run, then work backwards. */
1651 for (i = min; i < max; i++)
1654 if (eptr >= md->end_subject) break;
1655 GETCHARLEN(c, eptr, len);
1658 if (op == OP_CLASS) break;
1662 if ((data[c/8] & (1 << (c&7))) == 0) break;
1668 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1669 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1670 if (eptr-- == pp) break; /* Stop if tried at original pos */
1676 /* Not UTF-8 mode */
1678 for (i = min; i < max; i++)
1680 if (eptr >= md->end_subject) break;
1682 if ((data[c/8] & (1 << (c&7))) == 0) break;
1687 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1689 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1693 RRETURN(MATCH_NOMATCH);
1696 /* Control never gets here */
1699 /* Match an extended character class. This opcode is encountered only
1700 in UTF-8 mode, because that's the only time it is compiled. */
1705 data = ecode + 1 + LINK_SIZE; /* Save for matching */
1706 ecode += GET(ecode, 1); /* Advance past the item */
1716 c = *ecode++ - OP_CRSTAR;
1717 minimize = (c & 1) != 0;
1718 min = rep_min[c]; /* Pick up values from tables; */
1719 max = rep_max[c]; /* zero for max => infinity */
1720 if (max == 0) max = INT_MAX;
1725 minimize = (*ecode == OP_CRMINRANGE);
1726 min = GET2(ecode, 1);
1727 max = GET2(ecode, 3);
1728 if (max == 0) max = INT_MAX;
1732 default: /* No repeat follows */
1737 /* First, ensure the minimum number of matches are present. */
1739 for (i = 1; i <= min; i++)
1741 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1742 GETCHARINC(c, eptr);
1743 if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
1746 /* If max == min we can continue with the main loop without the
1749 if (min == max) continue;
1751 /* If minimizing, keep testing the rest of the expression and advancing
1752 the pointer while it matches the class. */
1756 for (fi = min;; fi++)
1758 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1759 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1760 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1761 GETCHARINC(c, eptr);
1762 if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
1764 /* Control never gets here */
1767 /* If maximizing, find the longest possible run, then work backwards. */
1772 for (i = min; i < max; i++)
1775 if (eptr >= md->end_subject) break;
1776 GETCHARLEN(c, eptr, len);
1777 if (!_pcre_xclass(c, data)) break;
1782 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1783 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1784 if (eptr-- == pp) break; /* Stop if tried at original pos */
1787 RRETURN(MATCH_NOMATCH);
1790 /* Control never gets here */
1792 #endif /* End of XCLASS */
1794 /* Match a single character, casefully */
1802 GETCHARLEN(fc, ecode, length);
1803 if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
1804 while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
1809 /* Non-UTF-8 mode */
1811 if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
1812 if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
1817 /* Match a single character, caselessly */
1825 GETCHARLEN(fc, ecode, length);
1827 if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
1829 /* If the pattern character's value is < 128, we have only one byte, and
1830 can use the fast lookup table. */
1834 if (md->lcc[*ecode++] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
1837 /* Otherwise we must pick up the subject character */
1842 GETCHARINC(dc, eptr);
1845 /* If we have Unicode property support, we can use it to test the other
1846 case of the character, if there is one. The result of ucp_findchar() is
1847 < 0 if the char isn't found, and othercase is returned as zero if there
1855 if (ucp_findchar(fc, &chartype, &othercase) < 0 || dc != othercase)
1857 RRETURN(MATCH_NOMATCH);
1862 #endif /* SUPPORT_UTF8 */
1864 /* Non-UTF-8 mode */
1866 if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
1867 if (md->lcc[ecode[1]] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
1872 /* Match a single character repeatedly; different opcodes share code. */
1875 min = max = GET2(ecode, 1);
1882 max = GET2(ecode, 1);
1883 minimize = *ecode == OP_MINUPTO;
1893 c = *ecode++ - OP_STAR;
1894 minimize = (c & 1) != 0;
1895 min = rep_min[c]; /* Pick up values from tables; */
1896 max = rep_max[c]; /* zero for max => infinity */
1897 if (max == 0) max = INT_MAX;
1899 /* Common code for all repeated single-character matches. We can give
1900 up quickly if there are fewer than the minimum number of characters left in
1909 GETCHARLEN(fc, ecode, length);
1910 if (min * length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
1913 /* Handle multibyte character matching specially here. There is
1914 support for caseless matching if UCP support is present. */
1924 if ((ims & PCRE_CASELESS) != 0 &&
1925 ucp_findchar(fc, &chartype, &othercase) >= 0 &&
1927 oclength = _pcre_ord2utf8(othercase, occhars);
1928 #endif /* SUPPORT_UCP */
1930 for (i = 1; i <= min; i++)
1932 if (memcmp(eptr, charptr, length) == 0) eptr += length;
1933 /* Need braces because of following else */
1934 else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
1937 if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
1942 if (min == max) continue;
1946 for (fi = min;; fi++)
1948 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1949 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1950 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1951 if (memcmp(eptr, charptr, length) == 0) eptr += length;
1952 /* Need braces because of following else */
1953 else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
1956 if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
1960 /* Control never gets here */
1965 for (i = min; i < max; i++)
1967 if (eptr > md->end_subject - length) break;
1968 if (memcmp(eptr, charptr, length) == 0) eptr += length;
1969 else if (oclength == 0) break;
1972 if (memcmp(eptr, occhars, oclength) != 0) break;
1978 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1979 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1982 RRETURN(MATCH_NOMATCH);
1984 /* Control never gets here */
1987 /* If the length of a UTF-8 character is 1, we fall through here, and
1988 obey the code as for non-UTF-8 characters below, though in this case the
1989 value of fc will always be < 128. */
1992 #endif /* SUPPORT_UTF8 */
1994 /* When not in UTF-8 mode, load a single-byte character. */
1996 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2000 /* The value of fc at this point is always less than 256, though we may or
2001 may not be in UTF-8 mode. The code is duplicated for the caseless and
2002 caseful cases, for speed, since matching characters is likely to be quite
2003 common. First, ensure the minimum number of matches are present. If min =
2004 max, continue at the same level without recursing. Otherwise, if
2005 minimizing, keep trying the rest of the expression and advancing one
2006 matching character if failing, up to the maximum. Alternatively, if
2007 maximizing, find the maximum number of characters and work backwards. */
2009 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2012 if ((ims & PCRE_CASELESS) != 0)
2015 for (i = 1; i <= min; i++)
2016 if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2017 if (min == max) continue;
2020 for (fi = min;; fi++)
2022 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2023 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2024 if (fi >= max || eptr >= md->end_subject ||
2025 fc != md->lcc[*eptr++])
2026 RRETURN(MATCH_NOMATCH);
2028 /* Control never gets here */
2033 for (i = min; i < max; i++)
2035 if (eptr >= md->end_subject || fc != md->lcc[*eptr]) break;
2040 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2042 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2044 RRETURN(MATCH_NOMATCH);
2046 /* Control never gets here */
2049 /* Caseful comparisons (includes all multi-byte characters) */
2053 for (i = 1; i <= min; i++) if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
2054 if (min == max) continue;
2057 for (fi = min;; fi++)
2059 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2060 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2061 if (fi >= max || eptr >= md->end_subject || fc != *eptr++)
2062 RRETURN(MATCH_NOMATCH);
2064 /* Control never gets here */
2069 for (i = min; i < max; i++)
2071 if (eptr >= md->end_subject || fc != *eptr) break;
2076 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2078 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2080 RRETURN(MATCH_NOMATCH);
2083 /* Control never gets here */
2085 /* Match a negated single one-byte character. The character we are
2086 checking can be multibyte. */
2089 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2091 GETCHARINCTEST(c, eptr);
2092 if ((ims & PCRE_CASELESS) != 0)
2098 if (md->lcc[*ecode++] == c) RRETURN(MATCH_NOMATCH);
2102 if (*ecode++ == c) RRETURN(MATCH_NOMATCH);
2106 /* Match a negated single one-byte character repeatedly. This is almost a
2107 repeat of the code for a repeated single character, but I haven't found a
2108 nice way of commoning these up that doesn't require a test of the
2109 positive/negative option for each character match. Maybe that wouldn't add
2110 very much to the time taken, but character matching *is* what this is all
2114 min = max = GET2(ecode, 1);
2121 max = GET2(ecode, 1);
2122 minimize = *ecode == OP_NOTMINUPTO;
2131 case OP_NOTMINQUERY:
2132 c = *ecode++ - OP_NOTSTAR;
2133 minimize = (c & 1) != 0;
2134 min = rep_min[c]; /* Pick up values from tables; */
2135 max = rep_max[c]; /* zero for max => infinity */
2136 if (max == 0) max = INT_MAX;
2138 /* Common code for all repeated single-byte matches. We can give up quickly
2139 if there are fewer than the minimum number of bytes left in the
2143 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2146 /* The code is duplicated for the caseless and caseful cases, for speed,
2147 since matching characters is likely to be quite common. First, ensure the
2148 minimum number of matches are present. If min = max, continue at the same
2149 level without recursing. Otherwise, if minimizing, keep trying the rest of
2150 the expression and advancing one matching character if failing, up to the
2151 maximum. Alternatively, if maximizing, find the maximum number of
2152 characters and work backwards. */
2154 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2157 if ((ims & PCRE_CASELESS) != 0)
2166 for (i = 1; i <= min; i++)
2168 GETCHARINC(d, eptr);
2169 if (d < 256) d = md->lcc[d];
2170 if (fc == d) RRETURN(MATCH_NOMATCH);
2176 /* Not UTF-8 mode */
2178 for (i = 1; i <= min; i++)
2179 if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2182 if (min == max) continue;
2191 for (fi = min;; fi++)
2193 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2194 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2195 GETCHARINC(d, eptr);
2196 if (d < 256) d = md->lcc[d];
2197 if (fi >= max || eptr >= md->end_subject || fc == d)
2198 RRETURN(MATCH_NOMATCH);
2203 /* Not UTF-8 mode */
2205 for (fi = min;; fi++)
2207 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2208 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2209 if (fi >= max || eptr >= md->end_subject || fc == md->lcc[*eptr++])
2210 RRETURN(MATCH_NOMATCH);
2213 /* Control never gets here */
2227 for (i = min; i < max; i++)
2230 if (eptr >= md->end_subject) break;
2231 GETCHARLEN(d, eptr, len);
2232 if (d < 256) d = md->lcc[d];
2238 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2239 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2240 if (eptr-- == pp) break; /* Stop if tried at original pos */
2246 /* Not UTF-8 mode */
2248 for (i = min; i < max; i++)
2250 if (eptr >= md->end_subject || fc == md->lcc[*eptr]) break;
2255 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2256 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2261 RRETURN(MATCH_NOMATCH);
2263 /* Control never gets here */
2266 /* Caseful comparisons */
2275 for (i = 1; i <= min; i++)
2277 GETCHARINC(d, eptr);
2278 if (fc == d) RRETURN(MATCH_NOMATCH);
2283 /* Not UTF-8 mode */
2285 for (i = 1; i <= min; i++)
2286 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
2289 if (min == max) continue;
2298 for (fi = min;; fi++)
2300 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2301 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2302 GETCHARINC(d, eptr);
2303 if (fi >= max || eptr >= md->end_subject || fc == d)
2304 RRETURN(MATCH_NOMATCH);
2309 /* Not UTF-8 mode */
2311 for (fi = min;; fi++)
2313 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2314 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2315 if (fi >= max || eptr >= md->end_subject || fc == *eptr++)
2316 RRETURN(MATCH_NOMATCH);
2319 /* Control never gets here */
2333 for (i = min; i < max; i++)
2336 if (eptr >= md->end_subject) break;
2337 GETCHARLEN(d, eptr, len);
2343 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2344 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2345 if (eptr-- == pp) break; /* Stop if tried at original pos */
2351 /* Not UTF-8 mode */
2353 for (i = min; i < max; i++)
2355 if (eptr >= md->end_subject || fc == *eptr) break;
2360 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2361 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2366 RRETURN(MATCH_NOMATCH);
2369 /* Control never gets here */
2371 /* Match a single character type repeatedly; several different opcodes
2372 share code. This is very similar to the code for single characters, but we
2373 repeat it in the interests of efficiency. */
2376 min = max = GET2(ecode, 1);
2382 case OP_TYPEMINUPTO:
2384 max = GET2(ecode, 1);
2385 minimize = *ecode == OP_TYPEMINUPTO;
2390 case OP_TYPEMINSTAR:
2392 case OP_TYPEMINPLUS:
2394 case OP_TYPEMINQUERY:
2395 c = *ecode++ - OP_TYPESTAR;
2396 minimize = (c & 1) != 0;
2397 min = rep_min[c]; /* Pick up values from tables; */
2398 max = rep_max[c]; /* zero for max => infinity */
2399 if (max == 0) max = INT_MAX;
2401 /* Common code for all repeated single character type matches. Note that
2402 in UTF-8 mode, '.' matches a character of any length, but for the other
2403 character types, the valid characters are all one-byte long. */
2406 ctype = *ecode++; /* Code for the character type */
2409 if (ctype == OP_PROP || ctype == OP_NOTPROP)
2411 prop_fail_result = ctype == OP_NOTPROP;
2412 prop_type = *ecode++;
2413 if (prop_type >= 128)
2415 prop_test_against = prop_type - 128;
2416 prop_test_variable = &prop_category;
2420 prop_test_against = prop_type;
2421 prop_test_variable = &prop_chartype;
2424 else prop_type = -1;
2427 /* First, ensure the minimum number of matches are present. Use inline
2428 code for maximizing the speed, and do the type test once at the start
2429 (i.e. keep it out of the loop). Also we can test that there are at least
2430 the minimum number of bytes before we start. This isn't as effective in
2431 UTF-8 mode, but it does no harm. Separate the UTF-8 code completely as that
2432 is tidier. Also separate the UCP code, which can be the same for both UTF-8
2433 and single-bytes. */
2435 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2441 for (i = 1; i <= min; i++)
2443 GETCHARINC(c, eptr);
2444 prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
2445 if ((*prop_test_variable == prop_test_against) == prop_fail_result)
2446 RRETURN(MATCH_NOMATCH);
2450 /* Match extended Unicode sequences. We will get here only if the
2451 support is in the binary; otherwise a compile-time error occurs. */
2453 else if (ctype == OP_EXTUNI)
2455 for (i = 1; i <= min; i++)
2457 GETCHARINCTEST(c, eptr);
2458 prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
2459 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
2460 while (eptr < md->end_subject)
2463 if (!utf8) c = *eptr; else
2465 GETCHARLEN(c, eptr, len);
2467 prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
2468 if (prop_category != ucp_M) break;
2475 #endif /* SUPPORT_UCP */
2477 /* Handle all other cases when the coding is UTF-8 */
2480 if (utf8) switch(ctype)
2483 for (i = 1; i <= min; i++)
2485 if (eptr >= md->end_subject ||
2486 (*eptr++ == NEWLINE && (ims & PCRE_DOTALL) == 0))
2487 RRETURN(MATCH_NOMATCH);
2488 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2497 for (i = 1; i <= min; i++)
2499 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2500 GETCHARINC(c, eptr);
2501 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
2502 RRETURN(MATCH_NOMATCH);
2507 for (i = 1; i <= min; i++)
2509 if (eptr >= md->end_subject ||
2510 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
2511 RRETURN(MATCH_NOMATCH);
2512 /* No need to skip more bytes - we know it's a 1-byte character */
2516 case OP_NOT_WHITESPACE:
2517 for (i = 1; i <= min; i++)
2519 if (eptr >= md->end_subject ||
2520 (*eptr < 128 && (md->ctypes[*eptr++] & ctype_space) != 0))
2521 RRETURN(MATCH_NOMATCH);
2522 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2527 for (i = 1; i <= min; i++)
2529 if (eptr >= md->end_subject ||
2530 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
2531 RRETURN(MATCH_NOMATCH);
2532 /* No need to skip more bytes - we know it's a 1-byte character */
2536 case OP_NOT_WORDCHAR:
2537 for (i = 1; i <= min; i++)
2539 if (eptr >= md->end_subject ||
2540 (*eptr < 128 && (md->ctypes[*eptr++] & ctype_word) != 0))
2541 RRETURN(MATCH_NOMATCH);
2542 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2547 for (i = 1; i <= min; i++)
2549 if (eptr >= md->end_subject ||
2550 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
2551 RRETURN(MATCH_NOMATCH);
2552 /* No need to skip more bytes - we know it's a 1-byte character */
2557 RRETURN(PCRE_ERROR_INTERNAL);
2558 } /* End switch(ctype) */
2561 #endif /* SUPPORT_UTF8 */
2563 /* Code for the non-UTF-8 case for minimum matching of operators other
2564 than OP_PROP and OP_NOTPROP. */
2569 if ((ims & PCRE_DOTALL) == 0)
2571 for (i = 1; i <= min; i++)
2572 if (*eptr++ == NEWLINE) RRETURN(MATCH_NOMATCH);
2582 for (i = 1; i <= min; i++)
2583 if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
2587 for (i = 1; i <= min; i++)
2588 if ((md->ctypes[*eptr++] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
2591 case OP_NOT_WHITESPACE:
2592 for (i = 1; i <= min; i++)
2593 if ((md->ctypes[*eptr++] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
2597 for (i = 1; i <= min; i++)
2598 if ((md->ctypes[*eptr++] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
2601 case OP_NOT_WORDCHAR:
2602 for (i = 1; i <= min; i++)
2603 if ((md->ctypes[*eptr++] & ctype_word) != 0)
2604 RRETURN(MATCH_NOMATCH);
2608 for (i = 1; i <= min; i++)
2609 if ((md->ctypes[*eptr++] & ctype_word) == 0)
2610 RRETURN(MATCH_NOMATCH);
2614 RRETURN(PCRE_ERROR_INTERNAL);
2618 /* If min = max, continue at the same level without recursing */
2620 if (min == max) continue;
2622 /* If minimizing, we have to test the rest of the pattern before each
2623 subsequent match. Again, separate the UTF-8 case for speed, and also
2624 separate the UCP cases. */
2631 for (fi = min;; fi++)
2633 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2634 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2635 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2636 GETCHARINC(c, eptr);
2637 prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
2638 if ((*prop_test_variable == prop_test_against) == prop_fail_result)
2639 RRETURN(MATCH_NOMATCH);
2643 /* Match extended Unicode sequences. We will get here only if the
2644 support is in the binary; otherwise a compile-time error occurs. */
2646 else if (ctype == OP_EXTUNI)
2648 for (fi = min;; fi++)
2650 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2651 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2652 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2653 GETCHARINCTEST(c, eptr);
2654 prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
2655 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
2656 while (eptr < md->end_subject)
2659 if (!utf8) c = *eptr; else
2661 GETCHARLEN(c, eptr, len);
2663 prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
2664 if (prop_category != ucp_M) break;
2671 #endif /* SUPPORT_UCP */
2677 for (fi = min;; fi++)
2679 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2680 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2681 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2683 GETCHARINC(c, eptr);
2687 if ((ims & PCRE_DOTALL) == 0 && c == NEWLINE) RRETURN(MATCH_NOMATCH);
2694 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
2695 RRETURN(MATCH_NOMATCH);
2699 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
2700 RRETURN(MATCH_NOMATCH);
2703 case OP_NOT_WHITESPACE:
2704 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
2705 RRETURN(MATCH_NOMATCH);
2709 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
2710 RRETURN(MATCH_NOMATCH);
2713 case OP_NOT_WORDCHAR:
2714 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
2715 RRETURN(MATCH_NOMATCH);
2719 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
2720 RRETURN(MATCH_NOMATCH);
2724 RRETURN(PCRE_ERROR_INTERNAL);
2730 /* Not UTF-8 mode */
2732 for (fi = min;; fi++)
2734 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2735 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2736 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2741 if ((ims & PCRE_DOTALL) == 0 && c == NEWLINE) RRETURN(MATCH_NOMATCH);
2748 if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
2752 if ((md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
2755 case OP_NOT_WHITESPACE:
2756 if ((md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
2760 if ((md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
2763 case OP_NOT_WORDCHAR:
2764 if ((md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
2768 if ((md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
2772 RRETURN(PCRE_ERROR_INTERNAL);
2776 /* Control never gets here */
2779 /* If maximizing it is worth using inline code for speed, doing the type
2780 test once at the start (i.e. keep it out of the loop). Again, keep the
2781 UTF-8 and UCP stuff separate. */
2785 pp = eptr; /* Remember where we started */
2790 for (i = min; i < max; i++)
2793 if (eptr >= md->end_subject) break;
2794 GETCHARLEN(c, eptr, len);
2795 prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
2796 if ((*prop_test_variable == prop_test_against) == prop_fail_result)
2801 /* eptr is now past the end of the maximum run */
2805 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2806 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2807 if (eptr-- == pp) break; /* Stop if tried at original pos */
2812 /* Match extended Unicode sequences. We will get here only if the
2813 support is in the binary; otherwise a compile-time error occurs. */
2815 else if (ctype == OP_EXTUNI)
2817 for (i = min; i < max; i++)
2819 if (eptr >= md->end_subject) break;
2820 GETCHARINCTEST(c, eptr);
2821 prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
2822 if (prop_category == ucp_M) break;
2823 while (eptr < md->end_subject)
2826 if (!utf8) c = *eptr; else
2828 GETCHARLEN(c, eptr, len);
2830 prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
2831 if (prop_category != ucp_M) break;
2836 /* eptr is now past the end of the maximum run */
2840 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2841 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2842 if (eptr-- == pp) break; /* Stop if tried at original pos */
2843 for (;;) /* Move back over one extended */
2847 if (!utf8) c = *eptr; else
2849 GETCHARLEN(c, eptr, len);
2851 prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
2852 if (prop_category != ucp_M) break;
2859 #endif /* SUPPORT_UCP */
2870 /* Special code is required for UTF8, but when the maximum is unlimited
2871 we don't need it, so we repeat the non-UTF8 code. This is probably
2872 worth it, because .* is quite a common idiom. */
2876 if ((ims & PCRE_DOTALL) == 0)
2878 for (i = min; i < max; i++)
2880 if (eptr >= md->end_subject || *eptr == NEWLINE) break;
2882 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2887 for (i = min; i < max; i++)
2890 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2895 /* Handle unlimited UTF-8 repeat */
2899 if ((ims & PCRE_DOTALL) == 0)
2901 for (i = min; i < max; i++)
2903 if (eptr >= md->end_subject || *eptr == NEWLINE) break;
2911 if (c > md->end_subject - eptr) c = md->end_subject - eptr;
2917 /* The byte case is the same as non-UTF8 */
2921 if (c > md->end_subject - eptr) c = md->end_subject - eptr;
2926 for (i = min; i < max; i++)
2929 if (eptr >= md->end_subject) break;
2930 GETCHARLEN(c, eptr, len);
2931 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
2937 for (i = min; i < max; i++)
2940 if (eptr >= md->end_subject) break;
2941 GETCHARLEN(c, eptr, len);
2942 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
2947 case OP_NOT_WHITESPACE:
2948 for (i = min; i < max; i++)
2951 if (eptr >= md->end_subject) break;
2952 GETCHARLEN(c, eptr, len);
2953 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
2959 for (i = min; i < max; i++)
2962 if (eptr >= md->end_subject) break;
2963 GETCHARLEN(c, eptr, len);
2964 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
2969 case OP_NOT_WORDCHAR:
2970 for (i = min; i < max; i++)
2973 if (eptr >= md->end_subject) break;
2974 GETCHARLEN(c, eptr, len);
2975 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
2981 for (i = min; i < max; i++)
2984 if (eptr >= md->end_subject) break;
2985 GETCHARLEN(c, eptr, len);
2986 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
2992 RRETURN(PCRE_ERROR_INTERNAL);
2995 /* eptr is now past the end of the maximum run */
2999 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
3000 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3001 if (eptr-- == pp) break; /* Stop if tried at original pos */
3008 /* Not UTF-8 mode */
3013 if ((ims & PCRE_DOTALL) == 0)
3015 for (i = min; i < max; i++)
3017 if (eptr >= md->end_subject || *eptr == NEWLINE) break;
3022 /* For DOTALL case, fall through and treat as \C */
3026 if (c > md->end_subject - eptr) c = md->end_subject - eptr;
3031 for (i = min; i < max; i++)
3033 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0)
3040 for (i = min; i < max; i++)
3042 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) == 0)
3048 case OP_NOT_WHITESPACE:
3049 for (i = min; i < max; i++)
3051 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) != 0)
3058 for (i = min; i < max; i++)
3060 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) == 0)
3066 case OP_NOT_WORDCHAR:
3067 for (i = min; i < max; i++)
3069 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) != 0)
3076 for (i = min; i < max; i++)
3078 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) == 0)
3085 RRETURN(PCRE_ERROR_INTERNAL);
3088 /* eptr is now past the end of the maximum run */
3092 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
3094 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3098 /* Get here if we can't make it match with any permitted repetitions */
3100 RRETURN(MATCH_NOMATCH);
3102 /* Control never gets here */
3104 /* There's been some horrible disaster. Since all codes > OP_BRA are
3105 for capturing brackets, and there shouldn't be any gaps between 0 and
3106 OP_BRA, arrival here can only mean there is something seriously wrong
3107 in the code above or the OP_xxx definitions. */
3110 DPRINTF(("Unknown opcode %d\n", *ecode));
3111 RRETURN(PCRE_ERROR_UNKNOWN_NODE);
3114 /* Do not stick any code in here without much thought; it is assumed
3115 that "continue" in the code above comes out to here to repeat the main loop. */
3118 } /* End of main loop */
3119 /* Control never reaches here */
3123 /***************************************************************************
3124 ****************************************************************************
3125 RECURSION IN THE match() FUNCTION
3127 Undefine all the macros that were defined above to handle this. */
3145 #undef new_recursive
3161 #undef save_capture_last
3171 /* These two are defined as macros in both cases */
3176 /***************************************************************************
3177 ***************************************************************************/
3181 /*************************************************
3182 * Execute a Regular Expression *
3183 *************************************************/
3185 /* This function applies a compiled re to a subject string and picks out
3186 portions of the string if it matches. Two elements in the vector are set for
3187 each substring: the offsets to the start and end of the substring.
3189 Arguments:
3190 argument_re points to the compiled expression
3191 extra_data points to extra data or is NULL
3192 subject points to the subject string
3193 length length of subject string (may contain binary zeros)
3194 start_offset where to start in the subject string
3195 options option bits
3196 offsets points to a vector of ints to be filled in with offsets
3197 offsetcount the number of elements in the vector
3199 Returns: > 0 => success; value is the number of elements filled in
3200 = 0 => success, but offsets is not big enough
3201 -1 => failed to match
3202 < -1 => some kind of unexpected problem
3203 */
/* pcre_exec: check the arguments, fill in a match_data block from the compiled
pattern, the optional extra_data and the option bits, and then call match() at
successive start positions in the subject (just once when the match is
anchored). On a successful match the return value is 0 if the offsets vector
overflowed or offsetcount < 2, and end_offset_top/2 otherwise; failures return
one of the PCRE_ERROR_xxx codes seen below.
NOTE(review): some source lines (the last parameter, braces, a few declarations
and statements) are absent from this listing; the notes added below describe
only what is visible. */
3206 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
3207 const char *subject, int length, int start_offset, int options, int *offsets,
3210 int rc, resetcount, ocount;
3211 int first_byte = -1;
3214 unsigned long int ims = 0;
3215 BOOL using_temporary_offsets = FALSE;
3219 BOOL first_byte_caseless = FALSE;
3220 BOOL req_byte_caseless = FALSE;
3221 match_data match_block;
3222 const uschar *tables;
3223 const uschar *start_bits = NULL;
/* NOTE(review): start_match and req_byte_ptr are derived from subject here,
before the NULL check on subject further down; req_byte_ptr starts one byte
before start_match. */
3224 const uschar *start_match = (const uschar *)subject + start_offset;
3225 const uschar *end_subject;
3226 const uschar *req_byte_ptr = start_match - 1;
3228 pcre_study_data internal_study;
3229 const pcre_study_data *study;
3231 real_pcre internal_re;
3232 const real_pcre *external_re = (const real_pcre *)argument_re;
3233 const real_pcre *re = external_re;
3235 /* Plausibility checks */
3237 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
/* A NULL offsets vector is rejected only when offsetcount is positive.
NOTE(review): no range check of start_offset against length is visible in this
listing. */
3238 if (re == NULL || subject == NULL ||
3239 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
3240 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
3242 /* Fish out the optional data from the extra_data structure, first setting
3243 the default values. */
3246 match_block.match_limit = MATCH_LIMIT;
3247 match_block.callout_data = NULL;
3249 /* The table pointer is always in native byte order. */
3251 tables = external_re->tables;
/* Each optional value is taken from extra_data only if its flag bit is set. */
3253 if (extra_data != NULL)
3255 register unsigned int flags = extra_data->flags;
3256 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
3257 study = (const pcre_study_data *)extra_data->study_data;
3258 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
3259 match_block.match_limit = extra_data->match_limit;
3260 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
3261 match_block.callout_data = extra_data->callout_data;
3262 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
3265 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
3266 is a feature that makes it possible to save compiled regex and re-use them
3267 in other programs later. */
3269 if (tables == NULL) tables = _pcre_default_tables;
3271 /* Check that the first field in the block is the magic number. If it is not,
3272 test for a regex that was compiled on a host of opposite endianness. If this is
3273 the case, flipped values are put in internal_re and internal_study if there was
3276 if (re->magic_number != MAGIC_NUMBER)
3278 re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
3279 if (re == NULL) return PCRE_ERROR_BADMAGIC;
3280 if (study != NULL) study = &internal_study;
3283 /* Set up other data */
3285 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
3286 startline = (re->options & PCRE_STARTLINE) != 0;
3287 firstline = (re->options & PCRE_FIRSTLINE) != 0;
3289 /* The code starts after the real_pcre block and the capture name table. */
3291 match_block.start_code = (const uschar *)external_re + re->name_table_offset +
3292 re->name_count * re->name_entry_size;
3294 match_block.start_subject = (const uschar *)subject;
3295 match_block.start_offset = start_offset;
3296 match_block.end_subject = match_block.start_subject + length;
3297 end_subject = match_block.end_subject;
/* endonly and utf8 are taken from the options stored in the compiled pattern;
notbol, noteol, notempty and partial are taken from the options argument. */
3299 match_block.endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
3300 match_block.utf8 = (re->options & PCRE_UTF8) != 0;
3302 match_block.notbol = (options & PCRE_NOTBOL) != 0;
3303 match_block.noteol = (options & PCRE_NOTEOL) != 0;
3304 match_block.notempty = (options & PCRE_NOTEMPTY) != 0;
3305 match_block.partial = (options & PCRE_PARTIAL) != 0;
3306 match_block.hitend = FALSE;
3308 match_block.recursive = NULL; /* No recursion at top level */
3310 match_block.lcc = tables + lcc_offset;
3311 match_block.ctypes = tables + ctypes_offset;
3313 /* Partial matching is supported only for a restricted set of regexes at the
3316 if (match_block.partial && (re->options & PCRE_NOPARTIAL) != 0)
3317 return PCRE_ERROR_BADPARTIAL;
3319 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
3320 back the character offset. */
3323 if (match_block.utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
/* A result >= 0 from _pcre_valid_utf8() is treated as an invalid subject. */
3325 if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
3326 return PCRE_ERROR_BADUTF8;
3327 if (start_offset > 0 && start_offset < length)
3329 int tb = ((uschar *)subject)[start_offset];
3333 if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
3339 /* The ims options can vary during the matching as a result of the presence
3340 of (?ims) items in the pattern. They are kept in a local variable so that
3341 restoring at the exit of a group is easy. */
3343 ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
3345 /* If the expression has got more back references than the offsets supplied can
3346 hold, we get a temporary chunk of working store to use during the matching.
3347 Otherwise, we can use the vector supplied, rounding down its size to a multiple
3350 ocount = offsetcount - (offsetcount % 3);
3352 if (re->top_backref > 0 && re->top_backref >= ocount/3)
3354 ocount = re->top_backref * 3 + 3;
3355 match_block.offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
3356 if (match_block.offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
3357 using_temporary_offsets = TRUE;
3358 DPRINTF(("Got memory to hold back references\n"));
3360 else match_block.offset_vector = offsets;
3362 match_block.offset_end = ocount;
3363 match_block.offset_max = (2*ocount)/3;
3364 match_block.offset_overflow = FALSE;
3365 match_block.capture_last = -1;
3367 /* Compute the minimum number of offsets that we need to reset each time. Doing
3368 this makes a huge difference to execution time when there aren't many brackets
3371 resetcount = 2 + re->top_bracket * 2;
3372 if (resetcount > offsetcount) resetcount = ocount;
3374 /* Reset the working variable associated with each extraction. These should
3375 never be used unless previously set, but they get saved and restored, and so we
3376 initialize them to avoid reading uninitialized locations. */
3378 if (match_block.offset_vector != NULL)
/* Working down from the end of the vector, set resetcount/2 - 1 entries
to -1. */
3380 register int *iptr = match_block.offset_vector + ocount;
3381 register int *iend = iptr - resetcount/2 + 1;
3382 while (--iptr >= iend) *iptr = -1;
3385 /* Set up the first character to match, if available. The first_byte value is
3386 never set for an anchored regular expression, but the anchoring may be forced
3387 at run time, so we have to test for anchoring. The first char may be unset for
3388 an unanchored pattern, of course. If there's no first char and the pattern was
3389 studied, there may be a bitmap of possible first characters. */
3393 if ((re->options & PCRE_FIRSTSET) != 0)
3395 first_byte = re->first_byte & 255;
/* A caseless first byte is mapped through the lcc table so that it can be
compared with lcc[*start_match] in the scanning loop further down. */
3396 if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
3397 first_byte = match_block.lcc[first_byte];
3400 if (!startline && study != NULL &&
3401 (study->options & PCRE_STUDY_MAPPED) != 0)
3402 start_bits = study->start_bits;
3405 /* For anchored or unanchored matches, there may be a "last known required
3408 if ((re->options & PCRE_REQCHSET) != 0)
3410 req_byte = re->req_byte & 255;
3411 req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
3412 req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */
3415 /* Loop for handling unanchored repeated matching attempts; for anchored regexs
3416 the loop runs just once. */
3420 const uschar *save_end_subject = end_subject;
3422 /* Reset the maximum number of extractions we might see. */
3424 if (match_block.offset_vector != NULL)
3426 register int *iptr = match_block.offset_vector;
3427 register int *iend = iptr + resetcount;
3428 while (iptr < iend) *iptr++ = -1;
3431 /* Advance to a unique first char if possible. If firstline is TRUE, the
3432 start of the match is constrained to the first line of a multiline string.
3433 Implement this by temporarily adjusting end_subject so that we stop scanning
3434 at a newline. If the match fails at the newline, later code breaks this loop.
3439 const uschar *t = start_match;
3440 while (t < save_end_subject && *t != '\n') t++;
3444 /* Now test for a unique first byte */
3446 if (first_byte >= 0)
3448 if (first_byte_caseless)
3449 while (start_match < end_subject &&
3450 match_block.lcc[*start_match] != first_byte)
3453 while (start_match < end_subject && *start_match != first_byte)
3457 /* Or to just after \n for a multiline match if possible */
3461 if (start_match > match_block.start_subject + start_offset)
3463 while (start_match < end_subject && start_match[-1] != NEWLINE)
3468 /* Or to a non-unique first char after study */
3470 else if (start_bits != NULL)
3472 while (start_match < end_subject)
3474 register unsigned int c = *start_match;
/* start_bits is tested as a bitmap: bit (c&7) of byte c/8 is set when c is
a possible first byte. */
3475 if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; else break;
3479 /* Restore fudged end_subject */
3481 end_subject = save_end_subject;
3483 #ifdef DEBUG /* Sigh. Some compilers never learn. */
3484 printf(">>>> Match against: ");
3485 pchars(start_match, end_subject - start_match, TRUE, &match_block);
3489 /* If req_byte is set, we know that that character must appear in the subject
3490 for the match to succeed. If the first character is set, req_byte must be
3491 later in the subject; otherwise the test starts at the match point. This
3492 optimization can save a huge amount of backtracking in patterns with nested
3493 unlimited repeats that aren't going to match. Writing separate code for
3494 cased/caseless versions makes it go faster, as does using an autoincrement
3495 and backing off on a match.
3497 HOWEVER: when the subject string is very, very long, searching to its end can
3498 take a long time, and give bad performance on quite ordinary patterns. This
3499 showed up when somebody was matching /^C/ on a 32-megabyte string... so we
3500 don't do this when the string is sufficiently long.
3502 ALSO: this processing is disabled when partial matching is requested.
3505 if (req_byte >= 0 &&
3506 end_subject - start_match < REQ_BYTE_MAX &&
3507 !match_block.partial)
3509 register const uschar *p = start_match + ((first_byte >= 0)? 1 : 0);
3511 /* We don't need to repeat the search if we haven't yet reached the
3512 place we found it at last time. */
3514 if (p > req_byte_ptr)
3516 if (req_byte_caseless)
3518 while (p < end_subject)
3520 register int pp = *p++;
/* p has already been incremented past the byte just read, so step back to
leave it pointing at the matching byte. */
3521 if (pp == req_byte || pp == req_byte2) { p--; break; }
3526 while (p < end_subject)
3528 if (*p++ == req_byte) { p--; break; }
3532 /* If we can't find the required character, break the matching loop */
3534 if (p >= end_subject) break;
3536 /* If we have found the required character, save the point where we
3537 found it, so that we don't search again next time round the loop if
3538 the start hasn't passed this character yet. */
3544 /* When a match occurs, substrings will be set for all internal extractions;
3545 we just need to set up the whole thing as substring 0 before returning. If
3546 there were too many extractions, set the return code to zero. In the case
3547 where we had to get some local store to hold offsets for backreferences, copy
3548 those back references that we can. In this case there need not be overflow
3549 if certain parts of the pattern were not used. */
3551 match_block.start_match = start_match;
3552 match_block.match_call_count = 0;
3554 rc = match(start_match, match_block.start_code, 2, &match_block, ims, NULL,
3557 /* When the result is no match, if the subject's first character was a
3558 newline and the PCRE_FIRSTLINE option is set, break (which will return
3559 PCRE_ERROR_NOMATCH). The option requests that a match occur before the first
3560 newline in the subject. Otherwise, advance the pointer to the next character
3561 and continue - but the continuation will actually happen only when the
3562 pattern is not anchored. */
3564 if (rc == MATCH_NOMATCH)
3566 if (firstline && *start_match == NEWLINE) break;
/* In UTF-8 mode the loop condition below continues while start_match is
inside the subject and on a byte of the form 10xxxxxx. */
3569 if (match_block.utf8)
3570 while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
/* Any result other than MATCH_MATCH is handled here (MATCH_NOMATCH is
presumably dealt with by the test above).
NOTE(review): no pcre_free() of a temporary offset vector is visible on this
path in the listing - confirm against the full source. */
3576 if (rc != MATCH_MATCH)
3578 DPRINTF((">>>> error: returning %d\n", rc));
3582 /* We have a match! Copy the offset information from temporary store if
3585 if (using_temporary_offsets)
3587 if (offsetcount >= 4)
3589 memcpy(offsets + 2, match_block.offset_vector + 2,
3590 (offsetcount - 2) * sizeof(int));
3591 DPRINTF(("Copied offsets from temporary memory\n"));
3593 if (match_block.end_offset_top > offsetcount)
3594 match_block.offset_overflow = TRUE;
3596 DPRINTF(("Freeing temporary memory\n"));
3597 (pcre_free)(match_block.offset_vector);
3600 rc = match_block.offset_overflow? 0 : match_block.end_offset_top/2;
3602 if (offsetcount < 2) rc = 0; else
3604 offsets[0] = start_match - match_block.start_subject;
3605 offsets[1] = match_block.end_match_ptr - match_block.start_subject;
3608 DPRINTF((">>>> returning %d\n", rc));
3612 /* This "while" is the end of the "do" above */
3614 while (!anchored && start_match <= end_subject);
/* No match was found in the loop: release the temporary offset vector if one
was allocated. */
3616 if (using_temporary_offsets)
3618 DPRINTF(("Freeing temporary memory\n"));
3619 (pcre_free)(match_block.offset_vector);
/* Report a partial match only if partial matching was requested and hitend
is set. */
3622 if (match_block.partial && match_block.hitend)
3624 DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
3625 return PCRE_ERROR_PARTIAL;
3629 DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
3630 return PCRE_ERROR_NOMATCH;
3634 /* End of pcre_exec.c */