1 /* $Cambridge: exim/src/src/pcre/pcre_exec.c,v 1.4 2007/01/23 15:08:45 ph10 Exp $ */
3 /*************************************************
4 * Perl-Compatible Regular Expressions *
5 *************************************************/
7 /* PCRE is a library of functions to support regular expressions whose syntax
8 and semantics are as close as possible to those of the Perl 5 language.
10 Written by Philip Hazel
11 Copyright (c) 1997-2006 University of Cambridge
13 -----------------------------------------------------------------------------
14 Redistribution and use in source and binary forms, with or without
15 modification, are permitted provided that the following conditions are met:
17 * Redistributions of source code must retain the above copyright notice,
18 this list of conditions and the following disclaimer.
20 * Redistributions in binary form must reproduce the above copyright
21 notice, this list of conditions and the following disclaimer in the
22 documentation and/or other materials provided with the distribution.
24 * Neither the name of the University of Cambridge nor the names of its
25 contributors may be used to endorse or promote products derived from
26 this software without specific prior written permission.
28 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
29 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
30 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
31 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
32 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
33 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
34 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
35 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
36 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
37 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38 POSSIBILITY OF SUCH DAMAGE.
39 -----------------------------------------------------------------------------
43 /* This module contains pcre_exec(), the externally visible function that does
44 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
45 possible. There are also some static supporting functions. */
47 #define NLBLOCK md /* Block containing newline information */
48 #define PSSTART start_subject /* Field containing processed string start */
49 #define PSEND end_subject /* Field containing processed string end */
51 #include "pcre_internal.h"
53 /* The chain of eptrblocks for tail recursions uses memory in stack workspace,
54 obtained at top level, the size of which is defined by EPTR_WORK_SIZE. */
56 #define EPTR_WORK_SIZE (1000)
58 /* Flag bits for the match() function */
60 #define match_condassert 0x01 /* Called to check a condition assertion */
61 #define match_cbegroup 0x02 /* Could-be-empty unlimited repeat group */
62 #define match_tail_recursed 0x04 /* Tail recursive call */
64 /* Non-error returns from the match() function. Error returns are externally
65 defined PCRE_ERROR_xxx codes, which are all negative. */
68 #define MATCH_NOMATCH 0
70 /* Maximum number of ints of offset to save on the stack for recursive calls.
71 If the offset vector is bigger, malloc is used. This should be a multiple of 3,
72 because the offset vector is always a multiple of 3 long. */
74 #define REC_STACK_SAVE_MAX 30
76 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
78 static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
79 static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
84 /*************************************************
85 * Debugging function to print chars *
86 *************************************************/
88 /* Print a sequence of chars in printable format, stopping at the end of the
89 subject if requested.
92 p points to characters
93 length number to print
94 is_subject TRUE if printing from within md->start_subject
95 md pointer to matching data block, if is_subject is TRUE
101 pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
104 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
106 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
112 /*************************************************
113 * Match a back-reference *
114 *************************************************/
116 /* If a back reference hasn't been set, the length that is passed is greater
117 than the number of characters left in the string, so the match fails.
120 offset index into the offset vector
121 eptr points into the subject
122 length length to be matched
123 md points to match data block
126 Returns: TRUE if matched
130 match_ref(int offset, register USPTR eptr, int length, match_data *md,
131 unsigned long int ims)
133 USPTR p = md->start_subject + md->offset_vector[offset];
136 if (eptr >= md->end_subject)
137 printf("matching subject <null>");
140 printf("matching subject ");
141 pchars(eptr, length, TRUE, md);
143 printf(" against backref ");
144 pchars(p, length, FALSE, md);
148 /* Always fail if not enough characters left */
150 if (length > md->end_subject - eptr) return FALSE;
152 /* Separate the caseless case for speed */
154 if ((ims & PCRE_CASELESS) != 0)
157 if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE;
160 { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
167 /***************************************************************************
168 ****************************************************************************
169 RECURSION IN THE match() FUNCTION
171 The match() function is highly recursive, though not every recursive call
172 increases the recursive depth. Nevertheless, some regular expressions can cause
173 it to recurse to a great depth. I was writing for Unix, so I just let it call
174 itself recursively. This uses the stack for saving everything that has to be
175 saved for a recursive call. On Unix, the stack can be large, and this works
178 It turns out that on some non-Unix-like systems there are problems with
179 programs that use a lot of stack. (This despite the fact that every last chip
180 has oodles of memory these days, and techniques for extending the stack have
181 been known for decades.) So....
183 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
184 calls by keeping local variables that need to be preserved in blocks of memory
185 obtained from malloc() instead of on the stack. Macros are used to
186 achieve this so that the actual code doesn't look very different to what it
188 ****************************************************************************
189 ***************************************************************************/
192 /* These versions of the macros use the stack, as normal. There are debugging
193 versions and production versions. */
196 #define REGISTER register
198 #define RMATCH(rx,ra,rb,rc,rd,re,rf,rg) \
200 printf("match() called in line %d\n", __LINE__); \
201 rx = match(ra,rb,rc,rd,re,rf,rg,rdepth+1); \
202 printf("to line %d\n", __LINE__); \
204 #define RRETURN(ra) \
206 printf("match() returned %d from line %d ", ra, __LINE__); \
210 #define RMATCH(rx,ra,rb,rc,rd,re,rf,rg) \
211 rx = match(ra,rb,rc,rd,re,rf,rg,rdepth+1)
212 #define RRETURN(ra) return ra
218 /* These versions of the macros manage a private stack on the heap. Note
219 that the rd argument of RMATCH isn't actually used. It's the md argument of
220 match(), which never changes. */
224 #define RMATCH(rx,ra,rb,rc,rd,re,rf,rg)\
226 heapframe *newframe = (pcre_stack_malloc)(sizeof(heapframe));\
227 if (setjmp(frame->Xwhere) == 0)\
229 newframe->Xeptr = ra;\
230 newframe->Xecode = rb;\
231 newframe->Xoffset_top = rc;\
232 newframe->Xims = re;\
233 newframe->Xeptrb = rf;\
234 newframe->Xflags = rg;\
235 newframe->Xrdepth = frame->Xrdepth + 1;\
236 newframe->Xprevframe = frame;\
238 DPRINTF(("restarting from line %d\n", __LINE__));\
243 DPRINTF(("longjumped back to line %d\n", __LINE__));\
244 frame = md->thisframe;\
245 rx = frame->Xresult;\
251 heapframe *newframe = frame;\
252 frame = newframe->Xprevframe;\
253 (pcre_stack_free)(newframe);\
256 frame->Xresult = ra;\
257 md->thisframe = frame;\
258 longjmp(frame->Xwhere, 1);\
264 /* Structure for remembering the local variables in a private frame */
266 typedef struct heapframe {
267 struct heapframe *Xprevframe;
269 /* Function arguments that may change */
272 const uschar *Xecode;
277 unsigned int Xrdepth;
279 /* Function local variables */
281 const uschar *Xcallpat;
282 const uschar *Xcharptr;
287 const uschar *Xsaved_eptr;
289 recursion_info Xnew_recursive;
295 unsigned long int Xoriginal_ims;
300 int Xprop_fail_result;
315 int Xsave_capture_last;
316 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
317 int Xstacksave[REC_STACK_SAVE_MAX];
321 /* Place to pass back result, and where to jump back to */
331 /***************************************************************************
332 ***************************************************************************/
336 /*************************************************
337 * Match from current position *
338 *************************************************/
340 /* This function is called recursively in many circumstances. Whenever it
341 returns a negative (error) response, the outer incarnation must also return the
344 Performance note: It might be tempting to extract commonly used fields from the
345 md structure (e.g. utf8, end_subject) into individual variables to improve
346 performance. Tests using gcc on a SPARC disproved this; in the first case, it
347 made performance worse.
350 eptr pointer to current character in subject
351 ecode pointer to current position in compiled code
352 offset_top current top pointer
353 md pointer to "static" info for the match
354 ims current /i, /m, and /s options
355 eptrb pointer to chain of blocks containing eptr at start of
356 brackets - for testing for empty matches
358 match_condassert - this is an assertion condition
359 match_cbegroup - this is the start of an unlimited repeat
360 group that can match an empty string
361 match_tail_recursed - this is a tail-recursed group
362 rdepth the recursion depth
364 Returns: MATCH_MATCH if matched ) these values are >= 0
365 MATCH_NOMATCH if failed to match )
366 a negative PCRE_ERROR_xxx value if aborted by an error condition
367 (e.g. stopped by repeated call or recursion limit)
371 match(REGISTER USPTR eptr, REGISTER const uschar *ecode,
372 int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
373 int flags, unsigned int rdepth)
375 /* These variables do not need to be preserved over recursion in this function,
376 so they can be ordinary variables in all cases. Mark some of them with
377 "register" because they are used a lot in loops. */
379 register int rrc; /* Returns from recursive calls */
380 register int i; /* Used for loops not involving calls to RMATCH() */
381 register unsigned int c; /* Character values not kept over RMATCH() calls */
382 register BOOL utf8; /* Local copy of UTF-8 flag for speed */
384 BOOL minimize, possessive; /* Quantifier options */
386 /* When recursion is not being used, all "local" variables that have to be
387 preserved over calls to RMATCH() are part of a "frame" which is obtained from
388 heap storage. Set up the top-level frame here; others are obtained from the
389 heap whenever RMATCH() does a "recursion". See the macro definitions above. */
392 heapframe *frame = (pcre_stack_malloc)(sizeof(heapframe));
393 frame->Xprevframe = NULL; /* Marks the top level */
395 /* Copy in the original argument variables */
398 frame->Xecode = ecode;
399 frame->Xoffset_top = offset_top;
401 frame->Xeptrb = eptrb;
402 frame->Xflags = flags;
403 frame->Xrdepth = rdepth;
405 /* This is where control jumps back to in order to effect "recursion" */
409 /* Macros make the argument variables come from the current frame */
411 #define eptr frame->Xeptr
412 #define ecode frame->Xecode
413 #define offset_top frame->Xoffset_top
414 #define ims frame->Xims
415 #define eptrb frame->Xeptrb
416 #define flags frame->Xflags
417 #define rdepth frame->Xrdepth
419 /* Ditto for the local variables */
422 #define charptr frame->Xcharptr
424 #define callpat frame->Xcallpat
425 #define data frame->Xdata
426 #define next frame->Xnext
427 #define pp frame->Xpp
428 #define prev frame->Xprev
429 #define saved_eptr frame->Xsaved_eptr
431 #define new_recursive frame->Xnew_recursive
433 #define cur_is_word frame->Xcur_is_word
434 #define condition frame->Xcondition
435 #define prev_is_word frame->Xprev_is_word
437 #define original_ims frame->Xoriginal_ims
440 #define prop_type frame->Xprop_type
441 #define prop_value frame->Xprop_value
442 #define prop_fail_result frame->Xprop_fail_result
443 #define prop_category frame->Xprop_category
444 #define prop_chartype frame->Xprop_chartype
445 #define prop_script frame->Xprop_script
448 #define ctype frame->Xctype
449 #define fc frame->Xfc
450 #define fi frame->Xfi
451 #define length frame->Xlength
452 #define max frame->Xmax
453 #define min frame->Xmin
454 #define number frame->Xnumber
455 #define offset frame->Xoffset
456 #define op frame->Xop
457 #define save_capture_last frame->Xsave_capture_last
458 #define save_offset1 frame->Xsave_offset1
459 #define save_offset2 frame->Xsave_offset2
460 #define save_offset3 frame->Xsave_offset3
461 #define stacksave frame->Xstacksave
463 #define newptrb frame->Xnewptrb
465 /* When recursion is being used, local variables are allocated on the stack and
466 get preserved during recursion in the normal way. In this environment, fi and
467 i, and fc and c, can be the same variables. */
469 #else /* NO_RECURSE not defined */
474 #ifdef SUPPORT_UTF8 /* Many of these variables are used only */
475 const uschar *charptr; /* in small blocks of the code. My normal */
476 #endif /* style of coding would have declared */
477 const uschar *callpat; /* them within each of those blocks. */
478 const uschar *data; /* However, in order to accommodate the */
479 const uschar *next; /* version of this code that uses an */
480 USPTR pp; /* external "stack" implemented on the */
481 const uschar *prev; /* heap, it is easier to declare them all */
482 USPTR saved_eptr; /* here, so the declarations can be cut */
483 /* out in a block. The only declarations */
484 recursion_info new_recursive; /* within blocks below are for variables */
485 /* that do not have to be preserved over */
486 BOOL cur_is_word; /* a recursive call to RMATCH(). */
490 unsigned long int original_ims;
495 int prop_fail_result;
508 int save_capture_last;
509 int save_offset1, save_offset2, save_offset3;
510 int stacksave[REC_STACK_SAVE_MAX];
513 #endif /* NO_RECURSE */
515 /* These statements are here to stop the compiler complaining about uninitialized
520 prop_fail_result = 0;
524 /* This label is used for tail recursion, which is used in a few cases even
525 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
526 used. Thanks to Ian Taylor for noticing this possibility and sending the
531 /* OK, now we can get on with the real code of the function. Recursive calls
532 are specified by the macro RMATCH and RRETURN is used to return. When
533 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
534 and a "return", respectively (possibly with some debugging if DEBUG is
535 defined). However, RMATCH isn't like a function call because it's quite a
536 complicated macro. It has to be used in one particular way. This shouldn't,
537 however, impact performance when true recursion is being used. */
539 /* First check that we haven't called match() too many times, or that we
540 haven't exceeded the recursive call limit. */
542 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
543 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
545 original_ims = ims; /* Save for resetting on ')' */
548 utf8 = md->utf8; /* Local copy of the flag */
553 /* At the start of a group with an unlimited repeat that may match an empty
554 string, the match_cbegroup flag is set. When this is the case, add the current
555 subject pointer to the chain of such remembered pointers, to be checked when we
556 hit the closing ket, in order to break infinite loops that match no characters.
557 When match() is called in other circumstances, don't add to the chain. If this
558 is a tail recursion, use a block from the workspace, as the one on the stack is
561 if ((flags & match_cbegroup) != 0)
564 if ((flags & match_tail_recursed) != 0)
566 if (md->eptrn >= EPTR_WORK_SIZE) RRETURN(PCRE_ERROR_NULLWSLIMIT);
567 p = md->eptrchain + md->eptrn++;
570 p->epb_saved_eptr = eptr;
575 /* Now start processing the opcodes. */
579 minimize = possessive = FALSE;
582 /* For partial matching, remember if we ever hit the end of the subject after
583 matching at least one subject character. */
586 eptr >= md->end_subject &&
587 eptr > md->start_match)
592 /* Handle a capturing bracket. If there is space in the offset vector, save
593 the current subject position in the working slot at the top of the vector.
594 We mustn't change the current values of the data slot, because they may be
595 set from a previous iteration of this group, and be referred to by a
596 reference inside the group.
598 If the bracket fails to match, we need to restore this value and also the
599 values of the final offsets, in case they were set by a previous iteration
602 If there isn't enough space in the offset vector, treat this as if it were
603 a non-capturing bracket. Don't worry about setting the flag for the error
604 case here; that is handled in the code for KET. */
608 number = GET2(ecode, 1+LINK_SIZE);
609 offset = number << 1;
612 printf("start bracket %d\n", number);
614 pchars(eptr, 16, TRUE, md);
618 if (offset < md->offset_max)
620 save_offset1 = md->offset_vector[offset];
621 save_offset2 = md->offset_vector[offset+1];
622 save_offset3 = md->offset_vector[md->offset_end - number];
623 save_capture_last = md->capture_last;
625 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
626 md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
628 flags = (op == OP_SCBRA)? match_cbegroup : 0;
631 RMATCH(rrc, eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
633 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
634 md->capture_last = save_capture_last;
635 ecode += GET(ecode, 1);
637 while (*ecode == OP_ALT);
639 DPRINTF(("bracket %d failed\n", number));
641 md->offset_vector[offset] = save_offset1;
642 md->offset_vector[offset+1] = save_offset2;
643 md->offset_vector[md->offset_end - number] = save_offset3;
645 RRETURN(MATCH_NOMATCH);
648 /* Insufficient room for saving captured contents. Treat as a non-capturing
651 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
653 /* Non-capturing bracket. Loop for all the alternatives. When we get to the
654 final alternative within the brackets, we would return the result of a
655 recursive call to match() whatever happened. We can reduce stack usage by
656 turning this into a tail recursion. */
660 DPRINTF(("start non-capturing bracket\n"));
661 flags = (op >= OP_SBRA)? match_cbegroup : 0;
664 if (ecode[GET(ecode, 1)] != OP_ALT)
666 ecode += _pcre_OP_lengths[*ecode];
667 flags |= match_tail_recursed;
668 DPRINTF(("bracket 0 tail recursion\n"));
672 /* For non-final alternatives, continue the loop for a NOMATCH result;
675 RMATCH(rrc, eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
677 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
678 ecode += GET(ecode, 1);
680 /* Control never reaches here. */
682 /* Conditional group: compilation checked that there are no more than
683 two branches. If the condition is false, skipping the first branch takes us
684 past the end if there is only one branch, but that's OK because that is
685 exactly what going to the ket would do. As there is only one branch to be
686 obeyed, we can use tail recursion to avoid using another stack frame. */
690 if (ecode[LINK_SIZE+1] == OP_RREF) /* Recursion test */
692 offset = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
693 condition = md->recursive != NULL &&
694 (offset == RREF_ANY || offset == md->recursive->group_num);
695 ecode += condition? 3 : GET(ecode, 1);
698 else if (ecode[LINK_SIZE+1] == OP_CREF) /* Group used test */
700 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
701 condition = offset < offset_top && md->offset_vector[offset] >= 0;
702 ecode += condition? 3 : GET(ecode, 1);
705 else if (ecode[LINK_SIZE+1] == OP_DEF) /* DEFINE - always false */
708 ecode += GET(ecode, 1);
711 /* The condition is an assertion. Call match() to evaluate it - setting
712 the final argument match_condassert causes it to stop at the end of an
717 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
719 if (rrc == MATCH_MATCH)
722 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
723 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
725 else if (rrc != MATCH_NOMATCH)
727 RRETURN(rrc); /* Need braces because of following else */
732 ecode += GET(ecode, 1);
736 /* We are now at the branch that is to be obeyed. As there is only one,
737 we can use tail recursion to avoid using another stack frame. If the second
738 alternative doesn't exist, we can just plough on. */
740 if (condition || *ecode == OP_ALT)
742 ecode += 1 + LINK_SIZE;
743 flags = match_tail_recursed | ((op == OP_SCOND)? match_cbegroup : 0);
748 ecode += 1 + LINK_SIZE;
753 /* End of the pattern. If we are in a top-level recursion, we should
754 restore the offsets appropriately and continue from after the call. */
757 if (md->recursive != NULL && md->recursive->group_num == 0)
759 recursion_info *rec = md->recursive;
760 DPRINTF(("End of pattern in a (?0) recursion\n"));
761 md->recursive = rec->prevrec;
762 memmove(md->offset_vector, rec->offset_save,
763 rec->saved_max * sizeof(int));
764 md->start_match = rec->save_start;
766 ecode = rec->after_call;
770 /* Otherwise, if PCRE_NOTEMPTY is set, fail if we have matched an empty
771 string - backtracking will then try other alternatives, if any. */
773 if (md->notempty && eptr == md->start_match) RRETURN(MATCH_NOMATCH);
774 md->end_match_ptr = eptr; /* Record where we ended */
775 md->end_offset_top = offset_top; /* and how many extracts were taken */
776 RRETURN(MATCH_MATCH);
778 /* Change option settings */
783 DPRINTF(("ims set to %02lx\n", ims));
786 /* Assertion brackets. Check the alternative branches in turn - the
787 matching won't pass the KET for an assertion. If any one branch matches,
788 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
789 start of each branch to move the current point backwards, so the code at
790 this level is identical to the lookahead case. */
796 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0);
797 if (rrc == MATCH_MATCH) break;
798 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
799 ecode += GET(ecode, 1);
801 while (*ecode == OP_ALT);
802 if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
804 /* If checking an assertion for a condition, return MATCH_MATCH. */
806 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
808 /* Continue from after the assertion, updating the offsets high water
809 mark, since extracts may have been taken during the assertion. */
811 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
812 ecode += 1 + LINK_SIZE;
813 offset_top = md->end_offset_top;
816 /* Negative assertion: all branches must fail to match */
819 case OP_ASSERTBACK_NOT:
822 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0);
823 if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH);
824 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
825 ecode += GET(ecode,1);
827 while (*ecode == OP_ALT);
829 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
831 ecode += 1 + LINK_SIZE;
834 /* Move the subject pointer back. This occurs only at the start of
835 each branch of a lookbehind assertion. If we are too close to the start to
836 move back, this match function fails. When working with UTF-8 we move
837 back a number of characters, not bytes. */
847 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
854 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
857 eptr -= GET(ecode, 1);
858 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
861 /* Skip to next op code */
863 ecode += 1 + LINK_SIZE;
866 /* The callout item calls an external function, if one is provided, passing
867 details of the match so far. This is mainly for debugging, though the
868 function is able to force a failure. */
871 if (pcre_callout != NULL)
873 pcre_callout_block cb;
874 cb.version = 1; /* Version 1 of the callout block */
875 cb.callout_number = ecode[1];
876 cb.offset_vector = md->offset_vector;
877 cb.subject = (PCRE_SPTR)md->start_subject;
878 cb.subject_length = md->end_subject - md->start_subject;
879 cb.start_match = md->start_match - md->start_subject;
880 cb.current_position = eptr - md->start_subject;
881 cb.pattern_position = GET(ecode, 2);
882 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
883 cb.capture_top = offset_top/2;
884 cb.capture_last = md->capture_last;
885 cb.callout_data = md->callout_data;
886 if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
887 if (rrc < 0) RRETURN(rrc);
889 ecode += 2 + 2*LINK_SIZE;
892 /* Recursion either matches the current regex, or some subexpression. The
893 offset data is the offset to the starting bracket from the start of the
894 whole pattern. (This is so that it works from duplicated subpatterns.)
896 If there are any capturing brackets started but not finished, we have to
897 save their starting points and reinstate them after the recursion. However,
898 we don't know how many such there are (offset_top records the completed
899 total) so we just have to save all the potential data. There may be up to
900 65535 such values, which is too large to put on the stack, but using malloc
901 for small numbers seems expensive. As a compromise, the stack is used when
902 there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
903 is used. If the malloc fails, the code below gives up and returns
904 PCRE_ERROR_NOMEMORY; no attempt is made to continue with a partial save of
905 the offset data.
907 There are also other values that have to be saved. We use a chained
908 sequence of blocks that actually live on the stack. Thanks to Robin Houston
909 for the original version of this logic. */
913 callpat = md->start_code + GET(ecode, 1);
914 new_recursive.group_num = (callpat == md->start_code)? 0 :
915 GET2(callpat, 1 + LINK_SIZE);
917 /* Add to "recursing stack" */
919 new_recursive.prevrec = md->recursive;
920 md->recursive = &new_recursive;
922 /* Find where to continue from afterwards */
924 ecode += 1 + LINK_SIZE;
925 new_recursive.after_call = ecode;
927 /* Now save the offset data. */
929 new_recursive.saved_max = md->offset_end;
930 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
931 new_recursive.offset_save = stacksave;
934 new_recursive.offset_save =
935 (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
936 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
939 memcpy(new_recursive.offset_save, md->offset_vector,
940 new_recursive.saved_max * sizeof(int));
941 new_recursive.save_start = md->start_match;
942 md->start_match = eptr;
944 /* OK, now we can do the recursion. For each top-level alternative we
945 restore the offset and recursion data. */
947 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
948 flags = (*callpat >= OP_SBRA)? match_cbegroup : 0;
951 RMATCH(rrc, eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
952 md, ims, eptrb, flags);
953 if (rrc == MATCH_MATCH)
955 DPRINTF(("Recursion matched\n"));
956 md->recursive = new_recursive.prevrec;
957 if (new_recursive.offset_save != stacksave)
958 (pcre_free)(new_recursive.offset_save);
959 RRETURN(MATCH_MATCH);
961 else if (rrc != MATCH_NOMATCH)
963 DPRINTF(("Recursion gave error %d\n", rrc));
967 md->recursive = &new_recursive;
968 memcpy(md->offset_vector, new_recursive.offset_save,
969 new_recursive.saved_max * sizeof(int));
970 callpat += GET(callpat, 1);
972 while (*callpat == OP_ALT);
974 DPRINTF(("Recursion didn't match\n"));
975 md->recursive = new_recursive.prevrec;
976 if (new_recursive.offset_save != stacksave)
977 (pcre_free)(new_recursive.offset_save);
978 RRETURN(MATCH_NOMATCH);
980 /* Control never reaches here */
982 /* "Once" brackets are like assertion brackets except that after a match,
983 the point in the subject string is not moved back. Thus there can never be
984 a move back into the brackets. Friedl calls these "atomic" subpatterns.
985 Check the alternative branches in turn - the matching won't pass the KET
986 for this kind of subpattern. If any one branch matches, we carry on as at
987 the end of a normal bracket, leaving the subject pointer. */
995 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims,
997 if (rrc == MATCH_MATCH) break;
998 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
999 ecode += GET(ecode,1);
1001 while (*ecode == OP_ALT);
1003 /* If we hit the end of the group (which could be repeated), fail */
1005 if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
1007 /* Continue as from after the assertion, updating the offsets high water
1008 mark, since extracts may have been taken. */
1010 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
1012 offset_top = md->end_offset_top;
1013 eptr = md->end_match_ptr;
1015 /* For a non-repeating ket, just continue at this level. This also
1016 happens for a repeating ket if no characters were matched in the group.
1017 This is the forcible breaking of infinite loops as implemented in Perl
1018 5.005. If there is an options reset, it will get obeyed in the normal
1019 course of events. */
1021 if (*ecode == OP_KET || eptr == saved_eptr)
1023 ecode += 1+LINK_SIZE;
1027 /* The repeating kets try the rest of the pattern or restart from the
1028 preceding bracket, in the appropriate order. The second "call" of match()
1029 uses tail recursion, to avoid using another stack frame. We need to reset
1030 any options that changed within the bracket before re-running it, so
1031 check the next opcode. */
1033 if (ecode[1+LINK_SIZE] == OP_OPT)
1035 ims = (ims & ~PCRE_IMS) | ecode[4];
1036 DPRINTF(("ims set to %02lx at group repeat\n", ims));
1039 if (*ecode == OP_KETRMIN)
1041 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0);
1042 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1044 flags = match_tail_recursed;
1047 else /* OP_KETRMAX */
1049 RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_cbegroup);
1050 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1051 ecode += 1 + LINK_SIZE;
1052 flags = match_tail_recursed;
1055 /* Control never gets here */
1057 /* An alternation is the end of a branch; scan along to find the end of the
1058 bracketed group and go to there. */
1061 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1064 /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
1065 that it may occur zero times. It may repeat infinitely, or not at all -
1066 i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
1067 repeat limits are compiled as a number of copies, with the optional ones
1068 preceded by BRAZERO or BRAMINZERO. */
1073 RMATCH(rrc, eptr, next, offset_top, md, ims, eptrb, 0);
1074 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1075 do next += GET(next,1); while (*next == OP_ALT);
1076 ecode = next + 1 + LINK_SIZE;
1083 do next += GET(next, 1); while (*next == OP_ALT);
1084 RMATCH(rrc, eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0);
1085 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1090 /* End of a group, repeated or non-repeating. */
1095 prev = ecode - GET(ecode, 1);
1097 /* If this was a group that remembered the subject start, in order to break
1098 infinite repeats of empty string matches, retrieve the subject start from
1099 the chain. Otherwise, set it NULL. */
1101 if (*prev >= OP_SBRA)
1103 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1104 eptrb = eptrb->epb_prev; /* Backup to previous group */
1106 else saved_eptr = NULL;
1108 /* If we are at the end of an assertion group, stop matching and return
1109 MATCH_MATCH, but record the current high water mark for use by positive
1110 assertions. Do this also for the "once" (atomic) groups. */
1112 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
1113 *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
1116 md->end_match_ptr = eptr; /* For ONCE */
1117 md->end_offset_top = offset_top;
1118 RRETURN(MATCH_MATCH);
1121 /* For capturing groups we have to check the group number back at the start
1122 and if necessary complete handling an extraction by setting the offsets and
1123 bumping the high water mark. Note that whole-pattern recursion is coded as
1124 a recurse into group 0, so it won't be picked up here. Instead, we catch it
1125 when the OP_END is reached. Other recursion is handled here. */
1127 if (*prev == OP_CBRA || *prev == OP_SCBRA)
1129 number = GET2(prev, 1+LINK_SIZE);
1130 offset = number << 1;
1133 printf("end bracket %d", number);
1137 md->capture_last = number;
1138 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1140 md->offset_vector[offset] =
1141 md->offset_vector[md->offset_end - number];
1142 md->offset_vector[offset+1] = eptr - md->start_subject;
1143 if (offset_top <= offset) offset_top = offset + 2;
1146 /* Handle a recursively called group. Restore the offsets
1147 appropriately and continue from after the call. */
1149 if (md->recursive != NULL && md->recursive->group_num == number)
1151 recursion_info *rec = md->recursive;
1152 DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
1153 md->recursive = rec->prevrec;
1154 md->start_match = rec->save_start;
1155 memcpy(md->offset_vector, rec->offset_save,
1156 rec->saved_max * sizeof(int));
1157 ecode = rec->after_call;
1163 /* For both capturing and non-capturing groups, reset the value of the ims
1164 flags, in case they got changed during the group. */
1167 DPRINTF(("ims reset to %02lx\n", ims));
1169 /* For a non-repeating ket, just continue at this level. This also
1170 happens for a repeating ket if no characters were matched in the group.
1171 This is the forcible breaking of infinite loops as implemented in Perl
1172 5.005. If there is an options reset, it will get obeyed in the normal
1173 course of events. */
1175 if (*ecode == OP_KET || eptr == saved_eptr)
1177 ecode += 1 + LINK_SIZE;
1181 /* The repeating kets try the rest of the pattern or restart from the
1182 preceding bracket, in the appropriate order. In the second case, we can use
1183 tail recursion to avoid using another stack frame. */
1185 flags = (*prev >= OP_SBRA)? match_cbegroup : 0;
1187 if (*ecode == OP_KETRMIN)
1189 RMATCH(rrc, eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0);
1190 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1192 flags |= match_tail_recursed;
1195 else /* OP_KETRMAX */
1197 RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, flags);
1198 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1199 ecode += 1 + LINK_SIZE;
1200 flags = match_tail_recursed;
1203 /* Control never gets here */
1205 /* Start of subject unless notbol, or after internal newline if multiline */
1208 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
1209 if ((ims & PCRE_MULTILINE) != 0)
1211 if (eptr != md->start_subject &&
1212 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
1213 RRETURN(MATCH_NOMATCH);
1217 /* ... else fall through */
1219 /* Start of subject assertion */
1222 if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
1226 /* Start of match assertion */
1229 if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
1233 /* Assert before internal newline if multiline, or before a terminating
1234 newline unless endonly is set, else end of subject unless noteol is set. */
1237 if ((ims & PCRE_MULTILINE) != 0)
1239 if (eptr < md->end_subject)
1240 { if (!IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); }
1242 { if (md->noteol) RRETURN(MATCH_NOMATCH); }
1248 if (md->noteol) RRETURN(MATCH_NOMATCH);
1251 if (eptr != md->end_subject &&
1252 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1253 RRETURN(MATCH_NOMATCH);
1258 /* ... else fall through for endonly */
1260 /* End of subject assertion (\z) */
1263 if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
1267 /* End of subject or ending \n assertion (\Z) */
1270 if (eptr != md->end_subject &&
1271 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1272 RRETURN(MATCH_NOMATCH);
1276 /* Word boundary assertions */
1278 case OP_NOT_WORD_BOUNDARY:
1279 case OP_WORD_BOUNDARY:
1282 /* Find out if the previous and current characters are "word" characters.
1283 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
1284 be "non-word" characters. */
1289 if (eptr == md->start_subject) prev_is_word = FALSE; else
1291 const uschar *lastptr = eptr - 1;
1292 while((*lastptr & 0xc0) == 0x80) lastptr--;
1293 GETCHAR(c, lastptr);
1294 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1296 if (eptr >= md->end_subject) cur_is_word = FALSE; else
1299 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1305 /* More streamlined when not in UTF-8 mode */
1308 prev_is_word = (eptr != md->start_subject) &&
1309 ((md->ctypes[eptr[-1]] & ctype_word) != 0);
1310 cur_is_word = (eptr < md->end_subject) &&
1311 ((md->ctypes[*eptr] & ctype_word) != 0);
1314 /* Now see if the situation is what we want */
1316 if ((*ecode++ == OP_WORD_BOUNDARY)?
1317 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
1318 RRETURN(MATCH_NOMATCH);
1322 /* Match a single character type; inline for speed */
1325 if ((ims & PCRE_DOTALL) == 0)
1327 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
1329 if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1331 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
1335 /* Match a single byte, even in UTF-8 mode. This opcode really does match
1336 any byte, even newline, independent of the setting of PCRE_DOTALL. */
1339 if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1344 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1345 GETCHARINCTEST(c, eptr);
1350 (md->ctypes[c] & ctype_digit) != 0
1352 RRETURN(MATCH_NOMATCH);
1357 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1358 GETCHARINCTEST(c, eptr);
1363 (md->ctypes[c] & ctype_digit) == 0
1365 RRETURN(MATCH_NOMATCH);
1369 case OP_NOT_WHITESPACE:
1370 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1371 GETCHARINCTEST(c, eptr);
1376 (md->ctypes[c] & ctype_space) != 0
1378 RRETURN(MATCH_NOMATCH);
1383 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1384 GETCHARINCTEST(c, eptr);
1389 (md->ctypes[c] & ctype_space) == 0
1391 RRETURN(MATCH_NOMATCH);
1395 case OP_NOT_WORDCHAR:
1396 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1397 GETCHARINCTEST(c, eptr);
1402 (md->ctypes[c] & ctype_word) != 0
1404 RRETURN(MATCH_NOMATCH);
1409 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1410 GETCHARINCTEST(c, eptr);
1415 (md->ctypes[c] & ctype_word) == 0
1417 RRETURN(MATCH_NOMATCH);
1422 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1423 GETCHARINCTEST(c, eptr);
1426 default: RRETURN(MATCH_NOMATCH);
1428 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
1442 /* Check the next character by Unicode property. We will get here only
1443 if the support is in the binary; otherwise a compile-time error occurs. */
1447 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1448 GETCHARINCTEST(c, eptr);
1450 int chartype, script;
1451 int category = _pcre_ucp_findprop(c, &chartype, &script);
1456 if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
1460 if ((chartype == ucp_Lu ||
1461 chartype == ucp_Ll ||
1462 chartype == ucp_Lt) == (op == OP_NOTPROP))
1463 RRETURN(MATCH_NOMATCH);
1467 if ((ecode[2] != category) == (op == OP_PROP))
1468 RRETURN(MATCH_NOMATCH);
1472 if ((ecode[2] != chartype) == (op == OP_PROP))
1473 RRETURN(MATCH_NOMATCH);
1477 if ((ecode[2] != script) == (op == OP_PROP))
1478 RRETURN(MATCH_NOMATCH);
1482 RRETURN(PCRE_ERROR_INTERNAL);
1489 /* Match an extended Unicode sequence. We will get here only if the support
1490 is in the binary; otherwise a compile-time error occurs. */
1493 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1494 GETCHARINCTEST(c, eptr);
1496 int chartype, script;
1497 int category = _pcre_ucp_findprop(c, &chartype, &script);
1498 if (category == ucp_M) RRETURN(MATCH_NOMATCH);
1499 while (eptr < md->end_subject)
1502 if (!utf8) c = *eptr; else
1504 GETCHARLEN(c, eptr, len);
1506 category = _pcre_ucp_findprop(c, &chartype, &script);
1507 if (category != ucp_M) break;
1516 /* Match a back reference, possibly repeatedly. Look past the end of the
1517 item to see if there is repeat information following. The code is similar
1518 to that for character classes, but repeated for efficiency. Then obey
1519 similar code to character type repeats - written out again for speed.
1520 However, if the referenced string is the empty string, always treat
1521 it as matched, any number of times (otherwise there could be infinite
1526 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
1527 ecode += 3; /* Advance past item */
1529 /* If the reference is unset, set the length to be longer than the amount
1530 of subject left; this ensures that every attempt at a match fails. We
1531 can't just fail here, because of the possibility of quantifiers with zero
1534 length = (offset >= offset_top || md->offset_vector[offset] < 0)?
1535 md->end_subject - eptr + 1 :
1536 md->offset_vector[offset+1] - md->offset_vector[offset];
1538 /* Set up for repetition, or handle the non-repeated case */
1548 c = *ecode++ - OP_CRSTAR;
1549 minimize = (c & 1) != 0;
1550 min = rep_min[c]; /* Pick up values from tables; */
1551 max = rep_max[c]; /* zero for max => infinity */
1552 if (max == 0) max = INT_MAX;
1557 minimize = (*ecode == OP_CRMINRANGE);
1558 min = GET2(ecode, 1);
1559 max = GET2(ecode, 3);
1560 if (max == 0) max = INT_MAX;
1564 default: /* No repeat follows */
1565 if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
1567 continue; /* With the main loop */
1570 /* If the length of the reference is zero, just continue with the
1573 if (length == 0) continue;
1575 /* First, ensure the minimum number of matches are present. We get back
1576 the length of the reference string explicitly rather than passing the
1577 address of eptr, so that eptr can be a register variable. */
1579 for (i = 1; i <= min; i++)
1581 if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
1585 /* If min = max, continue at the same level without recursion.
1586 They are not both allowed to be zero. */
1588 if (min == max) continue;
1590 /* If minimizing, keep trying and advancing the pointer */
1594 for (fi = min;; fi++)
1596 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1597 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1598 if (fi >= max || !match_ref(offset, eptr, length, md, ims))
1599 RRETURN(MATCH_NOMATCH);
1602 /* Control never gets here */
1605 /* If maximizing, find the longest string and work backwards */
1610 for (i = min; i < max; i++)
1612 if (!match_ref(offset, eptr, length, md, ims)) break;
1617 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1618 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1621 RRETURN(MATCH_NOMATCH);
1624 /* Control never gets here */
1628 /* Match a bit-mapped character class, possibly repeatedly. This op code is
1629 used when all the characters in the class have values in the range 0-255,
1630 and either the matching is caseful, or the characters are in the range
1631 0-127 when UTF-8 processing is enabled. The only difference between
1632 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
1635 First, look past the end of the item to see if there is repeat information
1636 following. Then obey similar code to character type repeats - written out
1642 data = ecode + 1; /* Save for matching */
1643 ecode += 33; /* Advance past the item */
1653 c = *ecode++ - OP_CRSTAR;
1654 minimize = (c & 1) != 0;
1655 min = rep_min[c]; /* Pick up values from tables; */
1656 max = rep_max[c]; /* zero for max => infinity */
1657 if (max == 0) max = INT_MAX;
1662 minimize = (*ecode == OP_CRMINRANGE);
1663 min = GET2(ecode, 1);
1664 max = GET2(ecode, 3);
1665 if (max == 0) max = INT_MAX;
1669 default: /* No repeat follows */
1674 /* First, ensure the minimum number of matches are present. */
1680 for (i = 1; i <= min; i++)
1682 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1683 GETCHARINC(c, eptr);
1686 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
1690 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1696 /* Not UTF-8 mode */
1698 for (i = 1; i <= min; i++)
1700 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1702 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1706 /* If max == min we can continue with the main loop without the
1709 if (min == max) continue;
1711 /* If minimizing, keep testing the rest of the expression and advancing
1712 the pointer while it matches the class. */
1720 for (fi = min;; fi++)
1722 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1723 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1724 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1725 GETCHARINC(c, eptr);
1728 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
1732 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1738 /* Not UTF-8 mode */
1740 for (fi = min;; fi++)
1742 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1743 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1744 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1746 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1749 /* Control never gets here */
1752 /* If maximizing, find the longest possible run, then work backwards. */
1762 for (i = min; i < max; i++)
1765 if (eptr >= md->end_subject) break;
1766 GETCHARLEN(c, eptr, len);
1769 if (op == OP_CLASS) break;
1773 if ((data[c/8] & (1 << (c&7))) == 0) break;
1779 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1780 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1781 if (eptr-- == pp) break; /* Stop if tried at original pos */
1787 /* Not UTF-8 mode */
1789 for (i = min; i < max; i++)
1791 if (eptr >= md->end_subject) break;
1793 if ((data[c/8] & (1 << (c&7))) == 0) break;
1798 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1799 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1804 RRETURN(MATCH_NOMATCH);
1807 /* Control never gets here */
1810 /* Match an extended character class. This opcode is encountered only
1811 in UTF-8 mode, because that's the only time it is compiled. */
1816 data = ecode + 1 + LINK_SIZE; /* Save for matching */
1817 ecode += GET(ecode, 1); /* Advance past the item */
1827 c = *ecode++ - OP_CRSTAR;
1828 minimize = (c & 1) != 0;
1829 min = rep_min[c]; /* Pick up values from tables; */
1830 max = rep_max[c]; /* zero for max => infinity */
1831 if (max == 0) max = INT_MAX;
1836 minimize = (*ecode == OP_CRMINRANGE);
1837 min = GET2(ecode, 1);
1838 max = GET2(ecode, 3);
1839 if (max == 0) max = INT_MAX;
1843 default: /* No repeat follows */
1848 /* First, ensure the minimum number of matches are present. */
1850 for (i = 1; i <= min; i++)
1852 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1853 GETCHARINC(c, eptr);
1854 if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
1857 /* If max == min we can continue with the main loop without the
1860 if (min == max) continue;
1862 /* If minimizing, keep testing the rest of the expression and advancing
1863 the pointer while it matches the class. */
1867 for (fi = min;; fi++)
1869 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1870 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1871 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1872 GETCHARINC(c, eptr);
1873 if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
1875 /* Control never gets here */
1878 /* If maximizing, find the longest possible run, then work backwards. */
1883 for (i = min; i < max; i++)
1886 if (eptr >= md->end_subject) break;
1887 GETCHARLEN(c, eptr, len);
1888 if (!_pcre_xclass(c, data)) break;
1893 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1894 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1895 if (eptr-- == pp) break; /* Stop if tried at original pos */
1898 RRETURN(MATCH_NOMATCH);
1901 /* Control never gets here */
1903 #endif /* End of XCLASS */
1905 /* Match a single character, casefully */
1913 GETCHARLEN(fc, ecode, length);
1914 if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
1915 while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
1920 /* Non-UTF-8 mode */
1922 if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
1923 if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
1928 /* Match a single character, caselessly */
1936 GETCHARLEN(fc, ecode, length);
1938 if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
1940 /* If the pattern character's value is < 128, we have only one byte, and
1941 can use the fast lookup table. */
1945 if (md->lcc[*ecode++] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
1948 /* Otherwise we must pick up the subject character */
1953 GETCHARINC(dc, eptr);
1956 /* If we have Unicode property support, we can use it to test the other
1957 case of the character, if there is one. */
1962 if (dc != _pcre_ucp_othercase(fc))
1964 RRETURN(MATCH_NOMATCH);
1969 #endif /* SUPPORT_UTF8 */
1971 /* Non-UTF-8 mode */
1973 if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
1974 if (md->lcc[ecode[1]] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
1979 /* Match a single character repeatedly. */
1982 min = max = GET2(ecode, 1);
1993 max = GET2(ecode, 1);
1994 minimize = *ecode == OP_MINUPTO;
2025 c = *ecode++ - OP_STAR;
2026 minimize = (c & 1) != 0;
2027 min = rep_min[c]; /* Pick up values from tables; */
2028 max = rep_max[c]; /* zero for max => infinity */
2029 if (max == 0) max = INT_MAX;
2031 /* Common code for all repeated single-character matches. We can give
2032 up quickly if there are fewer than the minimum number of characters left in
2041 GETCHARLEN(fc, ecode, length);
2042 if (min * length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2045 /* Handle multibyte character matching specially here. There is
2046 support for caseless matching if UCP support is present. */
2054 unsigned int othercase;
2055 if ((ims & PCRE_CASELESS) != 0 &&
2056 (othercase = _pcre_ucp_othercase(fc)) != NOTACHAR)
2057 oclength = _pcre_ord2utf8(othercase, occhars);
2058 #endif /* SUPPORT_UCP */
2060 for (i = 1; i <= min; i++)
2062 if (memcmp(eptr, charptr, length) == 0) eptr += length;
2063 /* Need braces because of following else */
2064 else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
2067 if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
2072 if (min == max) continue;
2076 for (fi = min;; fi++)
2078 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2079 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2080 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2081 if (memcmp(eptr, charptr, length) == 0) eptr += length;
2082 /* Need braces because of following else */
2083 else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
2086 if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
2090 /* Control never gets here */
2096 for (i = min; i < max; i++)
2098 if (eptr > md->end_subject - length) break;
2099 if (memcmp(eptr, charptr, length) == 0) eptr += length;
2100 else if (oclength == 0) break;
2103 if (memcmp(eptr, occhars, oclength) != 0) break;
2108 if (possessive) continue;
2111 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2112 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2115 RRETURN(MATCH_NOMATCH);
2117 /* Control never gets here */
2120 /* If the length of a UTF-8 character is 1, we fall through here, and
2121 obey the code as for non-UTF-8 characters below, though in this case the
2122 value of fc will always be < 128. */
2125 #endif /* SUPPORT_UTF8 */
2127 /* When not in UTF-8 mode, load a single-byte character. */
2129 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2133 /* The value of fc at this point is always less than 256, though we may or
2134 may not be in UTF-8 mode. The code is duplicated for the caseless and
2135 caseful cases, for speed, since matching characters is likely to be quite
2136 common. First, ensure the minimum number of matches are present. If min =
2137 max, continue at the same level without recursing. Otherwise, if
2138 minimizing, keep trying the rest of the expression and advancing one
2139 matching character if failing, up to the maximum. Alternatively, if
2140 maximizing, find the maximum number of characters and work backwards. */
2142 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2145 if ((ims & PCRE_CASELESS) != 0)
2148 for (i = 1; i <= min; i++)
2149 if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2150 if (min == max) continue;
2153 for (fi = min;; fi++)
2155 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2156 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2157 if (fi >= max || eptr >= md->end_subject ||
2158 fc != md->lcc[*eptr++])
2159 RRETURN(MATCH_NOMATCH);
2161 /* Control never gets here */
2166 for (i = min; i < max; i++)
2168 if (eptr >= md->end_subject || fc != md->lcc[*eptr]) break;
2171 if (possessive) continue;
2174 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2176 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2178 RRETURN(MATCH_NOMATCH);
2180 /* Control never gets here */
2183 /* Caseful comparisons (includes all multi-byte characters) */
2187 for (i = 1; i <= min; i++) if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
2188 if (min == max) continue;
2191 for (fi = min;; fi++)
2193 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2194 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2195 if (fi >= max || eptr >= md->end_subject || fc != *eptr++)
2196 RRETURN(MATCH_NOMATCH);
2198 /* Control never gets here */
2203 for (i = min; i < max; i++)
2205 if (eptr >= md->end_subject || fc != *eptr) break;
2208 if (possessive) continue;
2211 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2213 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2215 RRETURN(MATCH_NOMATCH);
2218 /* Control never gets here */
2220 /* Match a negated single one-byte character. The character we are
2221 checking can be multibyte. */
2224 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2226 GETCHARINCTEST(c, eptr);
2227 if ((ims & PCRE_CASELESS) != 0)
2233 if (md->lcc[*ecode++] == c) RRETURN(MATCH_NOMATCH);
2237 if (*ecode++ == c) RRETURN(MATCH_NOMATCH);
2241 /* Match a negated single one-byte character repeatedly. This is almost a
2242 repeat of the code for a repeated single character, but I haven't found a
2243 nice way of commoning these up that doesn't require a test of the
2244 positive/negative option for each character match. Maybe that wouldn't add
2245 very much to the time taken, but character matching *is* what this is all
2249 min = max = GET2(ecode, 1);
2256 max = GET2(ecode, 1);
2257 minimize = *ecode == OP_NOTMINUPTO;
2275 case OP_NOTPOSQUERY:
2285 max = GET2(ecode, 1);
2294 case OP_NOTMINQUERY:
2295 c = *ecode++ - OP_NOTSTAR;
2296 minimize = (c & 1) != 0;
2297 min = rep_min[c]; /* Pick up values from tables; */
2298 max = rep_max[c]; /* zero for max => infinity */
2299 if (max == 0) max = INT_MAX;
2301 /* Common code for all repeated single-byte matches. We can give up quickly
2302 if there are fewer than the minimum number of bytes left in the
2306 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2309 /* The code is duplicated for the caseless and caseful cases, for speed,
2310 since matching characters is likely to be quite common. First, ensure the
2311 minimum number of matches are present. If min = max, continue at the same
2312 level without recursing. Otherwise, if minimizing, keep trying the rest of
2313 the expression and advancing one matching character if failing, up to the
2314 maximum. Alternatively, if maximizing, find the maximum number of
2315 characters and work backwards. */
2317 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2320 if ((ims & PCRE_CASELESS) != 0)
2328 register unsigned int d;
2329 for (i = 1; i <= min; i++)
2331 GETCHARINC(d, eptr);
2332 if (d < 256) d = md->lcc[d];
2333 if (fc == d) RRETURN(MATCH_NOMATCH);
2339 /* Not UTF-8 mode */
2341 for (i = 1; i <= min; i++)
2342 if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2345 if (min == max) continue;
2353 register unsigned int d;
2354 for (fi = min;; fi++)
2356 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2357 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2358 GETCHARINC(d, eptr);
2359 if (d < 256) d = md->lcc[d];
2360 if (fi >= max || eptr >= md->end_subject || fc == d)
2361 RRETURN(MATCH_NOMATCH);
2366 /* Not UTF-8 mode */
2368 for (fi = min;; fi++)
2370 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2371 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2372 if (fi >= max || eptr >= md->end_subject || fc == md->lcc[*eptr++])
2373 RRETURN(MATCH_NOMATCH);
2376 /* Control never gets here */
2389 register unsigned int d;
2390 for (i = min; i < max; i++)
2393 if (eptr >= md->end_subject) break;
2394 GETCHARLEN(d, eptr, len);
2395 if (d < 256) d = md->lcc[d];
2399 if (possessive) continue;
2402 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2403 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2404 if (eptr-- == pp) break; /* Stop if tried at original pos */
2410 /* Not UTF-8 mode */
2412 for (i = min; i < max; i++)
2414 if (eptr >= md->end_subject || fc == md->lcc[*eptr]) break;
2417 if (possessive) continue;
2420 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2421 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2426 RRETURN(MATCH_NOMATCH);
2428 /* Control never gets here */
2431 /* Caseful comparisons */
2439 register unsigned int d;
2440 for (i = 1; i <= min; i++)
2442 GETCHARINC(d, eptr);
2443 if (fc == d) RRETURN(MATCH_NOMATCH);
2448 /* Not UTF-8 mode */
2450 for (i = 1; i <= min; i++)
2451 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
2454 if (min == max) continue;
2462 register unsigned int d;
2463 for (fi = min;; fi++)
2465 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2466 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2467 GETCHARINC(d, eptr);
2468 if (fi >= max || eptr >= md->end_subject || fc == d)
2469 RRETURN(MATCH_NOMATCH);
2474 /* Not UTF-8 mode */
2476 for (fi = min;; fi++)
2478 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2479 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2480 if (fi >= max || eptr >= md->end_subject || fc == *eptr++)
2481 RRETURN(MATCH_NOMATCH);
2484 /* Control never gets here */
2497 register unsigned int d;
2498 for (i = min; i < max; i++)
2501 if (eptr >= md->end_subject) break;
2502 GETCHARLEN(d, eptr, len);
2506 if (possessive) continue;
2509 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2510 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2511 if (eptr-- == pp) break; /* Stop if tried at original pos */
2517 /* Not UTF-8 mode */
2519 for (i = min; i < max; i++)
2521 if (eptr >= md->end_subject || fc == *eptr) break;
2524 if (possessive) continue;
2527 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2528 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2533 RRETURN(MATCH_NOMATCH);
2536 /* Control never gets here */
2538 /* Match a single character type repeatedly; several different opcodes
2539 share code. This is very similar to the code for single characters, but we
2540 repeat it in the interests of efficiency. */
2543 min = max = GET2(ecode, 1);
2549 case OP_TYPEMINUPTO:
2551 max = GET2(ecode, 1);
2552 minimize = *ecode == OP_TYPEMINUPTO;
2556 case OP_TYPEPOSSTAR:
2563 case OP_TYPEPOSPLUS:
2570 case OP_TYPEPOSQUERY:
2577 case OP_TYPEPOSUPTO:
2580 max = GET2(ecode, 1);
2585 case OP_TYPEMINSTAR:
2587 case OP_TYPEMINPLUS:
2589 case OP_TYPEMINQUERY:
2590 c = *ecode++ - OP_TYPESTAR;
2591 minimize = (c & 1) != 0;
2592 min = rep_min[c]; /* Pick up values from tables; */
2593 max = rep_max[c]; /* zero for max => infinity */
2594 if (max == 0) max = INT_MAX;
2596 /* Common code for all repeated single character type matches. Note that
2597 in UTF-8 mode, '.' matches a character of any length, but for the other
2598 character types, the valid characters are all one-byte long. */
2601 ctype = *ecode++; /* Code for the character type */
2604 if (ctype == OP_PROP || ctype == OP_NOTPROP)
2606 prop_fail_result = ctype == OP_NOTPROP;
2607 prop_type = *ecode++;
2608 prop_value = *ecode++;
2610 else prop_type = -1;
2613 /* First, ensure the minimum number of matches are present. Use inline
2614 code for maximizing the speed, and do the type test once at the start
2615 (i.e. keep it out of the loop). Also we can test that there are at least
2616 the minimum number of bytes before we start. This isn't as effective in
2617 UTF-8 mode, but it does no harm. Separate the UTF-8 code completely as that
2618 is tidier. Also separate the UCP code, which can be the same for both UTF-8
2619 and single-bytes. */
2621 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2630 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
2631 for (i = 1; i <= min; i++)
2633 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2634 GETCHARINC(c, eptr);
2639 for (i = 1; i <= min; i++)
2641 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2642 GETCHARINC(c, eptr);
2643 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2644 if ((prop_chartype == ucp_Lu ||
2645 prop_chartype == ucp_Ll ||
2646 prop_chartype == ucp_Lt) == prop_fail_result)
2647 RRETURN(MATCH_NOMATCH);
2652 for (i = 1; i <= min; i++)
2654 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2655 GETCHARINC(c, eptr);
2656 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2657 if ((prop_category == prop_value) == prop_fail_result)
2658 RRETURN(MATCH_NOMATCH);
2663 for (i = 1; i <= min; i++)
2665 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2666 GETCHARINC(c, eptr);
2667 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2668 if ((prop_chartype == prop_value) == prop_fail_result)
2669 RRETURN(MATCH_NOMATCH);
2674 for (i = 1; i <= min; i++)
2676 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2677 GETCHARINC(c, eptr);
2678 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2679 if ((prop_script == prop_value) == prop_fail_result)
2680 RRETURN(MATCH_NOMATCH);
2685 RRETURN(PCRE_ERROR_INTERNAL);
2689 /* Match extended Unicode sequences. We will get here only if the
2690 support is in the binary; otherwise a compile-time error occurs. */
2692 else if (ctype == OP_EXTUNI)
2694 for (i = 1; i <= min; i++)
2696 GETCHARINCTEST(c, eptr);
2697 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2698 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
2699 while (eptr < md->end_subject)
2702 if (!utf8) c = *eptr; else
2704 GETCHARLEN(c, eptr, len);
2706 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2707 if (prop_category != ucp_M) break;
2714 #endif /* SUPPORT_UCP */
2716 /* Handle all other cases when the coding is UTF-8 */
2719 if (utf8) switch(ctype)
2722 for (i = 1; i <= min; i++)
2724 if (eptr >= md->end_subject ||
2725 ((ims & PCRE_DOTALL) == 0 && IS_NEWLINE(eptr)))
2726 RRETURN(MATCH_NOMATCH);
2728 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2737 for (i = 1; i <= min; i++)
2739 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2740 GETCHARINC(c, eptr);
2743 default: RRETURN(MATCH_NOMATCH);
2745 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
2759 for (i = 1; i <= min; i++)
2761 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2762 GETCHARINC(c, eptr);
2763 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
2764 RRETURN(MATCH_NOMATCH);
2769 for (i = 1; i <= min; i++)
2771 if (eptr >= md->end_subject ||
2772 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
2773 RRETURN(MATCH_NOMATCH);
2774 /* No need to skip more bytes - we know it's a 1-byte character */
2778 case OP_NOT_WHITESPACE:
2779 for (i = 1; i <= min; i++)
2781 if (eptr >= md->end_subject ||
2782 (*eptr < 128 && (md->ctypes[*eptr++] & ctype_space) != 0))
2783 RRETURN(MATCH_NOMATCH);
2784 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2789 for (i = 1; i <= min; i++)
2791 if (eptr >= md->end_subject ||
2792 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
2793 RRETURN(MATCH_NOMATCH);
2794 /* No need to skip more bytes - we know it's a 1-byte character */
2798 case OP_NOT_WORDCHAR:
2799 for (i = 1; i <= min; i++)
2801 if (eptr >= md->end_subject ||
2802 (*eptr < 128 && (md->ctypes[*eptr++] & ctype_word) != 0))
2803 RRETURN(MATCH_NOMATCH);
2804 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2809 for (i = 1; i <= min; i++)
2811 if (eptr >= md->end_subject ||
2812 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
2813 RRETURN(MATCH_NOMATCH);
2814 /* No need to skip more bytes - we know it's a 1-byte character */
2819 RRETURN(PCRE_ERROR_INTERNAL);
2820 } /* End switch(ctype) */
2823 #endif /* SUPPORT_UTF8 */
2825 /* Code for the non-UTF-8 case for minimum matching of operators other
2826 than OP_PROP and OP_NOTPROP. We can assume that there are the minimum
2827 number of bytes present, as this was tested above. */
2832 if ((ims & PCRE_DOTALL) == 0)
2834 for (i = 1; i <= min; i++)
2836 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
2847 /* Because of the CRLF case, we can't assume the minimum number of
2848 bytes are present in this case. */
2851 for (i = 1; i <= min; i++)
2853 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2856 default: RRETURN(MATCH_NOMATCH);
2858 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
2870 for (i = 1; i <= min; i++)
2871 if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
2875 for (i = 1; i <= min; i++)
2876 if ((md->ctypes[*eptr++] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
2879 case OP_NOT_WHITESPACE:
2880 for (i = 1; i <= min; i++)
2881 if ((md->ctypes[*eptr++] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
2885 for (i = 1; i <= min; i++)
2886 if ((md->ctypes[*eptr++] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
2889 case OP_NOT_WORDCHAR:
2890 for (i = 1; i <= min; i++)
2891 if ((md->ctypes[*eptr++] & ctype_word) != 0)
2892 RRETURN(MATCH_NOMATCH);
2896 for (i = 1; i <= min; i++)
2897 if ((md->ctypes[*eptr++] & ctype_word) == 0)
2898 RRETURN(MATCH_NOMATCH);
2902 RRETURN(PCRE_ERROR_INTERNAL);
2906 /* If min = max, continue at the same level without recursing */
2908 if (min == max) continue;
2910 /* If minimizing, we have to test the rest of the pattern before each
2911 subsequent match. Again, separate the UTF-8 case for speed, and also
2912 separate the UCP cases. */
2922 for (fi = min;; fi++)
2924 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2925 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2926 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2927 GETCHARINC(c, eptr);
2928 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
2930 /* Control never gets here */
2933 for (fi = min;; fi++)
2935 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2936 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2937 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2938 GETCHARINC(c, eptr);
2939 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2940 if ((prop_chartype == ucp_Lu ||
2941 prop_chartype == ucp_Ll ||
2942 prop_chartype == ucp_Lt) == prop_fail_result)
2943 RRETURN(MATCH_NOMATCH);
2945 /* Control never gets here */
2948 for (fi = min;; fi++)
2950 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2951 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2952 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2953 GETCHARINC(c, eptr);
2954 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2955 if ((prop_category == prop_value) == prop_fail_result)
2956 RRETURN(MATCH_NOMATCH);
2958 /* Control never gets here */
2961 for (fi = min;; fi++)
2963 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2964 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2965 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2966 GETCHARINC(c, eptr);
2967 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2968 if ((prop_chartype == prop_value) == prop_fail_result)
2969 RRETURN(MATCH_NOMATCH);
2971 /* Control never gets here */
2974 for (fi = min;; fi++)
2976 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2977 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2978 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2979 GETCHARINC(c, eptr);
2980 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2981 if ((prop_script == prop_value) == prop_fail_result)
2982 RRETURN(MATCH_NOMATCH);
2984 /* Control never gets here */
2987 RRETURN(PCRE_ERROR_INTERNAL);
2991 /* Match extended Unicode sequences. We will get here only if the
2992 support is in the binary; otherwise a compile-time error occurs. */
2994 else if (ctype == OP_EXTUNI)
2996 for (fi = min;; fi++)
2998 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2999 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3000 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3001 GETCHARINCTEST(c, eptr);
3002 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3003 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
3004 while (eptr < md->end_subject)
3007 if (!utf8) c = *eptr; else
3009 GETCHARLEN(c, eptr, len);
3011 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3012 if (prop_category != ucp_M) break;
3019 #endif /* SUPPORT_UCP */
3025 for (fi = min;; fi++)
3027 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
3028 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3029 if (fi >= max || eptr >= md->end_subject ||
3030 (ctype == OP_ANY && (ims & PCRE_DOTALL) == 0 &&
3032 RRETURN(MATCH_NOMATCH);
3034 GETCHARINC(c, eptr);
3037 case OP_ANY: /* This is the DOTALL case */
3046 default: RRETURN(MATCH_NOMATCH);
3048 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3061 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
3062 RRETURN(MATCH_NOMATCH);
3066 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
3067 RRETURN(MATCH_NOMATCH);
3070 case OP_NOT_WHITESPACE:
3071 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
3072 RRETURN(MATCH_NOMATCH);
3076 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
3077 RRETURN(MATCH_NOMATCH);
3080 case OP_NOT_WORDCHAR:
3081 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
3082 RRETURN(MATCH_NOMATCH);
3086 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
3087 RRETURN(MATCH_NOMATCH);
3091 RRETURN(PCRE_ERROR_INTERNAL);
3097 /* Not UTF-8 mode */
3099 for (fi = min;; fi++)
3101 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
3102 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3103 if (fi >= max || eptr >= md->end_subject ||
3104 ((ims & PCRE_DOTALL) == 0 && IS_NEWLINE(eptr)))
3105 RRETURN(MATCH_NOMATCH);
3110 case OP_ANY: /* This is the DOTALL case */
3119 default: RRETURN(MATCH_NOMATCH);
3121 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3132 if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
3136 if ((md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
3139 case OP_NOT_WHITESPACE:
3140 if ((md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
3144 if ((md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
3147 case OP_NOT_WORDCHAR:
3148 if ((md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
3152 if ((md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
3156 RRETURN(PCRE_ERROR_INTERNAL);
3160 /* Control never gets here */
3163 /* If maximizing, it is worth using inline code for speed, doing the type
3164 test once at the start (i.e. keep it out of the loop). Again, keep the
3165 UTF-8 and UCP stuff separate. */
3169 pp = eptr; /* Remember where we started */
3177 for (i = min; i < max; i++)
3180 if (eptr >= md->end_subject) break;
3181 GETCHARLEN(c, eptr, len);
3182 if (prop_fail_result) break;
3188 for (i = min; i < max; i++)
3191 if (eptr >= md->end_subject) break;
3192 GETCHARLEN(c, eptr, len);
3193 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3194 if ((prop_chartype == ucp_Lu ||
3195 prop_chartype == ucp_Ll ||
3196 prop_chartype == ucp_Lt) == prop_fail_result)
3203 for (i = min; i < max; i++)
3206 if (eptr >= md->end_subject) break;
3207 GETCHARLEN(c, eptr, len);
3208 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3209 if ((prop_category == prop_value) == prop_fail_result)
3216 for (i = min; i < max; i++)
3219 if (eptr >= md->end_subject) break;
3220 GETCHARLEN(c, eptr, len);
3221 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3222 if ((prop_chartype == prop_value) == prop_fail_result)
3229 for (i = min; i < max; i++)
3232 if (eptr >= md->end_subject) break;
3233 GETCHARLEN(c, eptr, len);
3234 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3235 if ((prop_script == prop_value) == prop_fail_result)
3242 /* eptr is now past the end of the maximum run */
3244 if (possessive) continue;
3247 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
3248 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3249 if (eptr-- == pp) break; /* Stop if tried at original pos */
3254 /* Match extended Unicode sequences. We will get here only if the
3255 support is in the binary; otherwise a compile-time error occurs. */
3257 else if (ctype == OP_EXTUNI)
3259 for (i = min; i < max; i++)
3261 if (eptr >= md->end_subject) break;
3262 GETCHARINCTEST(c, eptr);
3263 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3264 if (prop_category == ucp_M) break;
3265 while (eptr < md->end_subject)
3268 if (!utf8) c = *eptr; else
3270 GETCHARLEN(c, eptr, len);
3272 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3273 if (prop_category != ucp_M) break;
3278 /* eptr is now past the end of the maximum run */
3280 if (possessive) continue;
3283 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
3284 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3285 if (eptr-- == pp) break; /* Stop if tried at original pos */
3286 for (;;) /* Move back over one extended */
3290 if (!utf8) c = *eptr; else
3292 GETCHARLEN(c, eptr, len);
3294 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3295 if (prop_category != ucp_M) break;
3302 #endif /* SUPPORT_UCP */
3313 /* Special code is required for UTF8, but when the maximum is
3314 unlimited we don't need it, so we repeat the non-UTF8 code. This is
3315 probably worth it, because .* is quite a common idiom. */
3319 if ((ims & PCRE_DOTALL) == 0)
3321 for (i = min; i < max; i++)
3323 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
3325 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3330 for (i = min; i < max; i++)
3332 if (eptr >= md->end_subject) break;
3334 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3339 /* Handle unlimited UTF-8 repeat */
3343 if ((ims & PCRE_DOTALL) == 0)
3345 for (i = min; i < max; i++)
3347 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
3355 if (c > (unsigned int)(md->end_subject - eptr))
3356 c = md->end_subject - eptr;
3362 /* The byte case is the same as non-UTF8 */
3366 if (c > (unsigned int)(md->end_subject - eptr))
3367 c = md->end_subject - eptr;
3372 for (i = min; i < max; i++)
3375 if (eptr >= md->end_subject) break;
3376 GETCHARLEN(c, eptr, len);
3379 if (++eptr >= md->end_subject) break;
3380 if (*eptr == 0x000a) eptr++;
3384 if (c != 0x000a && c != 0x000b && c != 0x000c &&
3385 c != 0x0085 && c != 0x2028 && c != 0x2029)
3393 for (i = min; i < max; i++)
3396 if (eptr >= md->end_subject) break;
3397 GETCHARLEN(c, eptr, len);
3398 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
3404 for (i = min; i < max; i++)
3407 if (eptr >= md->end_subject) break;
3408 GETCHARLEN(c, eptr, len);
3409 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
3414 case OP_NOT_WHITESPACE:
3415 for (i = min; i < max; i++)
3418 if (eptr >= md->end_subject) break;
3419 GETCHARLEN(c, eptr, len);
3420 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
3426 for (i = min; i < max; i++)
3429 if (eptr >= md->end_subject) break;
3430 GETCHARLEN(c, eptr, len);
3431 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
3436 case OP_NOT_WORDCHAR:
3437 for (i = min; i < max; i++)
3440 if (eptr >= md->end_subject) break;
3441 GETCHARLEN(c, eptr, len);
3442 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
3448 for (i = min; i < max; i++)
3451 if (eptr >= md->end_subject) break;
3452 GETCHARLEN(c, eptr, len);
3453 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
3459 RRETURN(PCRE_ERROR_INTERNAL);
3462 /* eptr is now past the end of the maximum run */
3464 if (possessive) continue;
3467 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
3468 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3469 if (eptr-- == pp) break; /* Stop if tried at original pos */
3476 /* Not UTF-8 mode */
3481 if ((ims & PCRE_DOTALL) == 0)
3483 for (i = min; i < max; i++)
3485 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
3490 /* For DOTALL case, fall through and treat as \C */
3494 if (c > (unsigned int)(md->end_subject - eptr))
3495 c = md->end_subject - eptr;
3500 for (i = min; i < max; i++)
3502 if (eptr >= md->end_subject) break;
3506 if (++eptr >= md->end_subject) break;
3507 if (*eptr == 0x000a) eptr++;
3511 if (c != 0x000a && c != 0x000b && c != 0x000c && c != 0x0085)
3519 for (i = min; i < max; i++)
3521 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0)
3528 for (i = min; i < max; i++)
3530 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) == 0)
3536 case OP_NOT_WHITESPACE:
3537 for (i = min; i < max; i++)
3539 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) != 0)
3546 for (i = min; i < max; i++)
3548 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) == 0)
3554 case OP_NOT_WORDCHAR:
3555 for (i = min; i < max; i++)
3557 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) != 0)
3564 for (i = min; i < max; i++)
3566 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) == 0)
3573 RRETURN(PCRE_ERROR_INTERNAL);
3576 /* eptr is now past the end of the maximum run */
3578 if (possessive) continue;
3581 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
3583 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3587 /* Get here if we can't make it match with any permitted repetitions */
3589 RRETURN(MATCH_NOMATCH);
3591 /* Control never gets here */
3593 /* There's been some horrible disaster. Arrival here can only mean there is
3594 something seriously wrong in the code above or the OP_xxx definitions. */
3597 DPRINTF(("Unknown opcode %d\n", *ecode));
3598 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
3601 /* Do not stick any code in here without much thought; it is assumed
3602 that "continue" in the code above comes out to here to repeat the main
3605 } /* End of main loop */
3606 /* Control never reaches here */
3610 /***************************************************************************
3611 ****************************************************************************
3612 RECURSION IN THE match() FUNCTION
3614 Undefine all the macros that were defined above to handle this. */
3632 #undef new_recursive
3647 #undef save_capture_last
3657 /* These two are defined as macros in both cases */
3662 /***************************************************************************
3663 ***************************************************************************/
3667 /*************************************************
3668 * Execute a Regular Expression *
3669 *************************************************/
/* pcre_exec() checks its arguments, copies settings from the compiled pattern,
the exec-time options and the optional extra_data block into a match_data
block, and then calls match() at successive start positions until one call
returns something other than MATCH_NOMATCH or a stopping condition is met. */
3671 /* This function applies a compiled re to a subject string and picks out
3672 portions of the string if it matches. Two elements in the vector are set for
3673 each substring: the offsets to the start and end of the substring.
3676 argument_re points to the compiled expression
3677 extra_data points to extra data or is NULL
3678 subject points to the subject string
3679 length length of subject string (may contain binary zeros)
3680 start_offset where to start in the subject string
3682 offsets points to a vector of ints to be filled in with offsets
3683 offsetcount the number of elements in the vector
3685 Returns: > 0 => success; value is the number of elements filled in
3686 = 0 => success, but offsets is not big enough
3687 -1 => failed to match
3688 < -1 => some kind of unexpected problem
3692 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
3693 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
3696 int rc, resetcount, ocount;
3697 int first_byte = -1;
3701 unsigned long int ims;
3702 BOOL using_temporary_offsets = FALSE;
3706 BOOL first_byte_caseless = FALSE;
3707 BOOL req_byte_caseless = FALSE;
3709 match_data match_block;
3710 match_data *md = &match_block;
3711 const uschar *tables;
3712 const uschar *start_bits = NULL;
3713 USPTR start_match = (USPTR)subject + start_offset;
3715 USPTR req_byte_ptr = start_match - 1;
3716 eptrblock eptrchain[EPTR_WORK_SIZE];
3718 pcre_study_data internal_study;
3719 const pcre_study_data *study;
3721 real_pcre internal_re;
3722 const real_pcre *external_re = (const real_pcre *)argument_re;
3723 const real_pcre *re = external_re;
3725 /* Plausibility checks */
3727 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
/* A NULL offsets pointer is rejected only when offsetcount is positive; a
negative offsetcount is rejected separately just below. */
3728 if (re == NULL || subject == NULL ||
3729 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
3730 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
3732 /* Fish out the optional data from the extra_data structure, first setting
3733 the default values. */
3736 md->match_limit = MATCH_LIMIT;
3737 md->match_limit_recursion = MATCH_LIMIT_RECURSION;
3738 md->callout_data = NULL;
3740 /* The table pointer is always in native byte order. */
3742 tables = external_re->tables;
/* Each optional field is taken from extra_data only if its bit is set in
extra_data->flags; a field whose bit is clear is ignored. */
3744 if (extra_data != NULL)
3746 register unsigned int flags = extra_data->flags;
3747 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
3748 study = (const pcre_study_data *)extra_data->study_data;
3749 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
3750 md->match_limit = extra_data->match_limit;
3751 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
3752 md->match_limit_recursion = extra_data->match_limit_recursion;
3753 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
3754 md->callout_data = extra_data->callout_data;
3755 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
3758 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
3759 is a feature that makes it possible to save compiled regex and re-use them
3760 in other programs later. */
3762 if (tables == NULL) tables = _pcre_default_tables;
3764 /* Check that the first field in the block is the magic number. If it is not,
3765 test for a regex that was compiled on a host of opposite endianness. If this is
3766 the case, flipped values are put in internal_re and internal_study if there was
3769 if (re->magic_number != MAGIC_NUMBER)
3771 re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
3772 if (re == NULL) return PCRE_ERROR_BADMAGIC;
3773 if (study != NULL) study = &internal_study;
3776 /* Set up other data */
/* anchored can be requested either in the compiled options or at exec time;
startline and firstline are read from the compiled options only. */
3778 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
3779 startline = (re->options & PCRE_STARTLINE) != 0;
3780 firstline = (re->options & PCRE_FIRSTLINE) != 0;
3782 /* The code starts after the real_pcre block and the capture name table. */
3784 md->start_code = (const uschar *)external_re + re->name_table_offset +
3785 re->name_count * re->name_entry_size;
3787 md->start_subject = (USPTR)subject;
3788 md->start_offset = start_offset;
3789 md->end_subject = md->start_subject + length;
3790 end_subject = md->end_subject;
3792 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
3793 utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0;
/* The next four flags are taken from the exec-time options only. */
3795 md->notbol = (options & PCRE_NOTBOL) != 0;
3796 md->noteol = (options & PCRE_NOTEOL) != 0;
3797 md->notempty = (options & PCRE_NOTEMPTY) != 0;
3798 md->partial = (options & PCRE_PARTIAL) != 0;
3801 md->recursive = NULL; /* No recursion at top level */
3802 md->eptrchain = eptrchain; /* Make workspace generally available */
3804 md->lcc = tables + lcc_offset;
3805 md->ctypes = tables + ctypes_offset;
3807 /* Handle different types of newline. The two bits give four cases. If nothing
3808 is set at run time, whatever was used at compile time applies. */
3810 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : options) &
3813 case 0: newline = NEWLINE; break; /* Compile-time default */
3814 case PCRE_NEWLINE_CR: newline = '\r'; break;
3815 case PCRE_NEWLINE_LF: newline = '\n'; break;
3816 case PCRE_NEWLINE_CR+
3817 PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
3818 case PCRE_NEWLINE_ANY: newline = -1; break;
3819 default: return PCRE_ERROR_BADNEWLINE;
3824 md->nltype = NLTYPE_ANY;
3828 md->nltype = NLTYPE_FIXED;
/* A two-byte newline is packed into newline as (first << 8) | second, so the
high byte becomes nl[0] and the low byte becomes nl[1]. */
3832 md->nl[0] = (newline >> 8) & 255;
3833 md->nl[1] = newline & 255;
3838 md->nl[0] = newline;
3842 /* Partial matching is supported only for a restricted set of regexes at the
3845 if (md->partial && (re->options & PCRE_NOPARTIAL) != 0)
3846 return PCRE_ERROR_BADPARTIAL;
3848 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
3849 back the character offset. */
3852 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
/* A result >= 0 from _pcre_valid_utf8() is treated as an invalid subject. */
3854 if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
3855 return PCRE_ERROR_BADUTF8;
3856 if (start_offset > 0 && start_offset < length)
3858 int tb = ((uschar *)subject)[start_offset];
/* NOTE(review): a start_offset whose byte fails the test below is rejected;
presumably tb has been reduced to its top two bits first, so that only a UTF-8
continuation byte fails -- confirm. */
3862 if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
3868 /* The ims options can vary during the matching as a result of the presence
3869 of (?ims) items in the pattern. They are kept in a local variable so that
3870 restoring at the exit of a group is easy. */
3872 ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
3874 /* If the expression has got more back references than the offsets supplied can
3875 hold, we get a temporary chunk of working store to use during the matching.
3876 Otherwise, we can use the vector supplied, rounding down its size to a multiple
3879 ocount = offsetcount - (offsetcount % 3);
3881 if (re->top_backref > 0 && re->top_backref >= ocount/3)
3883 ocount = re->top_backref * 3 + 3;
3884 md->offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
3885 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
3886 using_temporary_offsets = TRUE;
3887 DPRINTF(("Got memory to hold back references\n"));
3889 else md->offset_vector = offsets;
3891 md->offset_end = ocount;
3892 md->offset_max = (2*ocount)/3;
3893 md->offset_overflow = FALSE;
3894 md->capture_last = -1;
3896 /* Compute the minimum number of offsets that we need to reset each time. Doing
3897 this makes a huge difference to execution time when there aren't many brackets
3900 resetcount = 2 + re->top_bracket * 2;
3901 if (resetcount > offsetcount) resetcount = ocount;
3903 /* Reset the working variable associated with each extraction. These should
3904 never be used unless previously set, but they get saved and restored, and so we
3905 initialize them to avoid reading uninitialized locations. */
3907 if (md->offset_vector != NULL)
/* Starting from the last slot of the vector and working downwards, set
resetcount/2 - 1 entries to -1. */
3909 register int *iptr = md->offset_vector + ocount;
3910 register int *iend = iptr - resetcount/2 + 1;
3911 while (--iptr >= iend) *iptr = -1;
3914 /* Set up the first character to match, if available. The first_byte value is
3915 never set for an anchored regular expression, but the anchoring may be forced
3916 at run time, so we have to test for anchoring. The first char may be unset for
3917 an unanchored pattern, of course. If there's no first char and the pattern was
3918 studied, there may be a bitmap of possible first characters. */
3922 if ((re->options & PCRE_FIRSTSET) != 0)
/* The low 8 bits of re->first_byte hold the byte; when REQ_CASELESS is set it
is mapped through md->lcc, matching the lcc-mapped comparison used when
scanning for it in the loop further down. */
3924 first_byte = re->first_byte & 255;
3925 if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
3926 first_byte = md->lcc[first_byte];
/* The studied start-bit map is used only if the pattern is not flagged
startline and the study options mark the map as present. */
3929 if (!startline && study != NULL &&
3930 (study->options & PCRE_STUDY_MAPPED) != 0)
3931 start_bits = study->start_bits;
3934 /* For anchored or unanchored matches, there may be a "last known required
3937 if ((re->options & PCRE_REQCHSET) != 0)
3939 req_byte = re->req_byte & 255;
3940 req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
3941 req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */
3945 /* ==========================================================================*/
3947 /* Loop for handling unanchored repeated matching attempts; for anchored regexs
3948 the loop runs just once. */
/* Saved so that end_subject can be put back after the start-position scan. */
3952 USPTR save_end_subject = end_subject;
3954 /* Reset the maximum number of extractions we might see. */
/* Only the first resetcount entries are set to -1 here. */
3956 if (md->offset_vector != NULL)
3958 register int *iptr = md->offset_vector;
3959 register int *iend = iptr + resetcount;
3960 while (iptr < iend) *iptr++ = -1;
3963 /* Advance to a unique first char if possible. If firstline is TRUE, the
3964 start of the match is constrained to the first line of a multiline string.
3965 That is, the match must be before or at the first newline. Implement this by
3966 temporarily adjusting end_subject so that we stop scanning at a newline. If
3967 the match fails at the newline, later code breaks this loop. */
3971 USPTR t = start_match;
3972 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
3976 /* Now test for a unique first byte */
3978 if (first_byte >= 0)
3980 if (first_byte_caseless)
3981 while (start_match < end_subject &&
3982 md->lcc[*start_match] != first_byte)
3985 while (start_match < end_subject && *start_match != first_byte)
3989 /* Or to just after a linebreak for a multiline match if possible */
3993 if (start_match > md->start_subject + start_offset)
3995 while (start_match <= end_subject && !WAS_NEWLINE(start_match))
4000 /* Or to a non-unique first char after study */
4002 else if (start_bits != NULL)
4004 while (start_match < end_subject)
/* start_bits holds one bit per byte value, namely bit (c&7) of element c/8.
Bytes whose bit is clear are skipped. */
4006 register unsigned int c = *start_match;
4007 if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; else break;
4011 /* Restore fudged end_subject */
4013 end_subject = save_end_subject;
4015 #ifdef DEBUG /* Sigh. Some compilers never learn. */
4016 printf(">>>> Match against: ");
4017 pchars(start_match, end_subject - start_match, TRUE, md);
4021 /* If req_byte is set, we know that that character must appear in the subject
4022 for the match to succeed. If the first character is set, req_byte must be
4023 later in the subject; otherwise the test starts at the match point. This
4024 optimization can save a huge amount of backtracking in patterns with nested
4025 unlimited repeats that aren't going to match. Writing separate code for
4026 cased/caseless versions makes it go faster, as does using an autoincrement
4027 and backing off on a match.
4029 HOWEVER: when the subject string is very, very long, searching to its end can
4030 take a long time, and give bad performance on quite ordinary patterns. This
4031 showed up when somebody was matching something like /^\d+C/ on a 32-megabyte
4032 string... so we don't do this when the string is sufficiently long.
4034 ALSO: this processing is disabled when partial matching is requested.
4037 if (req_byte >= 0 &&
4038 end_subject - start_match < REQ_BYTE_MAX &&
4041 register USPTR p = start_match + ((first_byte >= 0)? 1 : 0);
4043 /* We don't need to repeat the search if we haven't yet reached the
4044 place we found it at last time. */
4046 if (p > req_byte_ptr)
4048 if (req_byte_caseless)
4050 while (p < end_subject)
/* p is advanced past each byte as it is read, then stepped back on a hit so
that it is left pointing at the byte that was found. */
4052 register int pp = *p++;
4053 if (pp == req_byte || pp == req_byte2) { p--; break; }
4058 while (p < end_subject)
4060 if (*p++ == req_byte) { p--; break; }
4064 /* If we can't find the required character, break the matching loop,
4065 forcing a match failure. */
4067 if (p >= end_subject)
4073 /* If we have found the required character, save the point where we
4074 found it, so that we don't search again next time round the loop if
4075 the start hasn't passed this character yet. */
4081 /* OK, we can now run the match. */
/* Per-attempt fields of md are reset before every call to match(). */
4083 md->start_match = start_match;
4084 md->match_call_count = 0;
4085 md->eptrn = 0; /* Next free eptrchain slot */
4086 rc = match(start_match, md->start_code, 2, md, ims, NULL, 0, 0);
4088 /* Any return other than MATCH_NOMATCH breaks the loop. */
4090 if (rc != MATCH_NOMATCH) break;
4092 /* If PCRE_FIRSTLINE is set, the match must happen before or at the first
4093 newline in the subject (though it may continue over the newline). Therefore,
4094 if we have just failed to match, starting at a newline, do not continue. */
4096 if (firstline && IS_NEWLINE(start_match)) break;
4098 /* Advance the match position by one character. */
/* Bytes of the form 10xxxxxx are UTF-8 continuation bytes; they are skipped so
that start_match is not left in the middle of a character. */
4103 while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
4107 /* Break the loop if the pattern is anchored or if we have passed the end of
4110 if (anchored || start_match > end_subject) break;
4112 /* If we have just passed a CR and the newline option is CRLF or ANY, and we
4113 are now at a LF, advance the match position by one more character. */
4115 if (start_match[-1] == '\r' &&
4116 (md->nltype == NLTYPE_ANY || md->nllen == 2) &&
4117 start_match < end_subject &&
4118 *start_match == '\n')
4121 } /* End of for(;;) "bumpalong" loop */
4123 /* ==========================================================================*/
4125 /* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping
4128 (1) The pattern is anchored;
4130 (2) We are past the end of the subject;
4132 (3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because
4133 this option requests that a match occur at or before the first newline in
4136 When we have a match and the offset vector is big enough to deal with any
4137 backreferences, captured substring offsets will already be set up. In the case
4138 where we had to get some local store to hold offsets for backreference
4139 processing, copy those that we can. In this case there need not be overflow if
4140 certain parts of the pattern were not used, even though there are more
4141 capturing parentheses than vector slots. */
4143 if (rc == MATCH_MATCH)
4145 if (using_temporary_offsets)
4147 if (offsetcount >= 4)
/* Copying starts at slot 2; slots 0 and 1 are filled in further down as
substring 0. */
4149 memcpy(offsets + 2, md->offset_vector + 2,
4150 (offsetcount - 2) * sizeof(int));
4151 DPRINTF(("Copied offsets from temporary memory\n"));
/* Flag overflow when the match set more offsets than the caller supplied room
for; this makes the return value 0 below. */
4153 if (md->end_offset_top > offsetcount) md->offset_overflow = TRUE;
4154 DPRINTF(("Freeing temporary memory\n"));
4155 (pcre_free)(md->offset_vector);
4158 /* Set the return code to the number of captured strings, or 0 if there are
4159 too many to fit into the vector. */
4161 rc = md->offset_overflow? 0 : md->end_offset_top/2;
4163 /* If there is space, set up the whole thing as substring 0. */
/* A vector with fewer than two slots cannot hold substring 0, so 0 is
returned in that case. */
4165 if (offsetcount < 2) rc = 0; else
4167 offsets[0] = start_match - md->start_subject;
4168 offsets[1] = md->end_match_ptr - md->start_subject;
4171 DPRINTF((">>>> returning %d\n", rc));
4175 /* Control gets here if there has been an error, or if the overall match
4176 attempt has failed at all permitted starting positions. */
/* The temporary offset vector is released on this path too. */
4178 if (using_temporary_offsets)
4180 DPRINTF(("Freeing temporary memory\n"));
4181 (pcre_free)(md->offset_vector);
4184 if (rc != MATCH_NOMATCH)
4186 DPRINTF((">>>> error: returning %d\n", rc));
/* NOTE(review): md->hitend is presumably set inside match() when the end of
the subject was reached during a partial-matching attempt -- confirm. */
4189 else if (md->partial && md->hitend)
4191 DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
4192 return PCRE_ERROR_PARTIAL;
4196 DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
4197 return PCRE_ERROR_NOMATCH;
4201 /* End of pcre_exec.c */