-/* $Cambridge: exim/src/src/pcre/pcre_exec.c,v 1.3 2006/11/07 16:50:36 ph10 Exp $ */
+/* $Cambridge: exim/src/src/pcre/pcre_exec.c,v 1.4 2007/01/23 15:08:45 ph10 Exp $ */
/*************************************************
* Perl-Compatible Regular Expressions *
pattern matching using an NFA algorithm, trying to mimic Perl as closely as
possible. There are also some static supporting functions. */
-#define NLBLOCK md /* The block containing newline information */
-#include "pcre_internal.h"
+#define NLBLOCK md /* Block containing newline information */
+#define PSSTART start_subject /* Field containing processed string start */
+#define PSEND end_subject /* Field containing processed string end */
+#include "pcre_internal.h"
-/* Structure for building a chain of data that actually lives on the
-stack, for holding the values of the subject pointer at the start of each
-subpattern, so as to detect when an empty string has been matched by a
-subpattern - to break infinite loops. When NO_RECURSE is set, these blocks
-are on the heap, not on the stack. */
+/* The chain of eptrblocks for tail recursions uses memory in stack workspace,
+obtained at top level, the size of which is defined by EPTR_WORK_SIZE. */
-typedef struct eptrblock {
- struct eptrblock *epb_prev;
- USPTR epb_saved_eptr;
-} eptrblock;
+#define EPTR_WORK_SIZE (1000)
/* Flag bits for the match() function */
-#define match_condassert 0x01 /* Called to check a condition assertion */
-#define match_isgroup 0x02 /* Set if start of bracketed group */
+#define match_condassert 0x01 /* Called to check a condition assertion */
+#define match_cbegroup 0x02 /* Could-be-empty unlimited repeat group */
+#define match_tail_recursed 0x04 /* Tail recursive call */
/* Non-error returns from the match() function. Error returns are externally
defined PCRE_ERROR_xxx codes, which are all negative. */
static void
pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
{
-int c;
+unsigned int c;
if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
while (length-- > 0)
if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
BOOL Xcur_is_word;
BOOL Xcondition;
- BOOL Xminimize;
BOOL Xprev_is_word;
unsigned long int Xoriginal_ims;
int Xprop_category;
int Xprop_chartype;
int Xprop_script;
- int *Xprop_test_variable;
#endif
int Xctype;
- int Xfc;
+ unsigned int Xfc;
int Xfi;
int Xlength;
int Xmax;
* Match from current position *
*************************************************/
-/* On entry ecode points to the first opcode, and eptr to the first character
-in the subject string, while eptrb holds the value of eptr at the start of the
-last bracketed group - used for breaking infinite loops matching zero-length
-strings. This function is called recursively in many circumstances. Whenever it
+/* This function is called recursively in many circumstances. Whenever it
returns a negative (error) response, the outer incarnation must also return the
same response.
made performance worse.
Arguments:
- eptr pointer in subject
- ecode position in code
+ eptr pointer to current character in subject
+ ecode pointer to current position in compiled code
offset_top current top pointer
md pointer to "static" info for the match
ims current /i, /m, and /s options
brackets - for testing for empty matches
flags can contain
match_condassert - this is an assertion condition
- match_isgroup - this is the start of a bracketed group
+ match_cbegroup - this is the start of an unlimited repeat
+ group that can match an empty string
+ match_tail_recursed - this is a tail_recursed group
rdepth the recursion depth
Returns: MATCH_MATCH if matched ) these values are >= 0
int flags, unsigned int rdepth)
{
/* These variables do not need to be preserved over recursion in this function,
-so they can be ordinary variables in all cases. Mark them with "register"
-because they are used a lot in loops. */
+so they can be ordinary variables in all cases. Mark some of them with
+"register" because they are used a lot in loops. */
register int rrc; /* Returns from recursive calls */
register int i; /* Used for loops not involving calls to RMATCH() */
-register unsigned int c; /* Character values not kept over RMATCH() calls */
+register unsigned int c; /* Character values not kept over RMATCH() calls */
register BOOL utf8; /* Local copy of UTF-8 flag for speed */
+BOOL minimize, possessive; /* Quantifier options */
+
/* When recursion is not being used, all "local" variables that have to be
preserved over calls to RMATCH() are part of a "frame" which is obtained from
heap storage. Set up the top-level frame here; others are obtained from the
#define cur_is_word frame->Xcur_is_word
#define condition frame->Xcondition
-#define minimize frame->Xminimize
#define prev_is_word frame->Xprev_is_word
#define original_ims frame->Xoriginal_ims
#define prop_category frame->Xprop_category
#define prop_chartype frame->Xprop_chartype
#define prop_script frame->Xprop_script
-#define prop_test_variable frame->Xprop_test_variable
#endif
#define ctype frame->Xctype
get preserved during recursion in the normal way. In this environment, fi and
i, and fc and c, can be the same variables. */
-#else
+#else /* NO_RECURSE not defined */
#define fi i
#define fc c
/* that do not have to be preserved over */
BOOL cur_is_word; /* a recursive call to RMATCH(). */
BOOL condition;
-BOOL minimize;
BOOL prev_is_word;
unsigned long int original_ims;
int prop_category;
int prop_chartype;
int prop_script;
-int *prop_test_variable;
#endif
int ctype;
int stacksave[REC_STACK_SAVE_MAX];
eptrblock newptrb;
-#endif
+#endif /* NO_RECURSE */
/* These statements are here to stop the compiler complaining about uninitialized
variables. */
#ifdef SUPPORT_UCP
prop_value = 0;
prop_fail_result = 0;
-prop_test_variable = NULL;
#endif
+
/* This label is used for tail recursion, which is used in a few cases even
when NO_RECURSE is not defined, in order to reduce the amount of stack that is
used. Thanks to Ian Taylor for noticing this possibility and sending the
utf8 = FALSE;
#endif
-/* At the start of a bracketed group, add the current subject pointer to the
-stack of such pointers, to be re-instated at the end of the group when we hit
-the closing ket. When match() is called in other circumstances, we don't add to
-this stack. */
+/* At the start of a group with an unlimited repeat that may match an empty
+string, the match_cbegroup flag is set. When this is the case, add the current
+subject pointer to the chain of such remembered pointers, to be checked when we
+hit the closing ket, in order to break infinite loops that match no characters.
+When match() is called in other circumstances, don't add to the chain. If this
+is a tail recursion, use a block from the workspace, as the one on the stack is
+already used. */
-if ((flags & match_isgroup) != 0)
+if ((flags & match_cbegroup) != 0)
{
- newptrb.epb_prev = eptrb;
- newptrb.epb_saved_eptr = eptr;
- eptrb = &newptrb;
+ eptrblock *p;
+ if ((flags & match_tail_recursed) != 0)
+ {
+ if (md->eptrn >= EPTR_WORK_SIZE) RRETURN(PCRE_ERROR_NULLWSLIMIT);
+ p = md->eptrchain + md->eptrn++;
+ }
+ else p = &newptrb;
+ p->epb_saved_eptr = eptr;
+ p->epb_prev = eptrb;
+ eptrb = p;
}
-/* Now start processing the operations. */
+/* Now start processing the opcodes. */
for (;;)
{
+ minimize = possessive = FALSE;
op = *ecode;
- minimize = FALSE;
/* For partial matching, remember if we ever hit the end of the subject after
matching at least one subject character. */
eptr > md->start_match)
md->hitend = TRUE;
- /* Opening capturing bracket. If there is space in the offset vector, save
- the current subject position in the working slot at the top of the vector. We
- mustn't change the current values of the data slot, because they may be set
- from a previous iteration of this group, and be referred to by a reference
- inside the group.
-
- If the bracket fails to match, we need to restore this value and also the
- values of the final offsets, in case they were set by a previous iteration of
- the same bracket.
-
- If there isn't enough space in the offset vector, treat this as if it were a
- non-capturing bracket. Don't worry about setting the flag for the error case
- here; that is handled in the code for KET. */
-
- if (op > OP_BRA)
+ switch(op)
{
- number = op - OP_BRA;
-
- /* For extended extraction brackets (large number), we have to fish out the
- number from a dummy opcode at the start. */
-
- if (number > EXTRACT_BASIC_MAX)
- number = GET2(ecode, 2+LINK_SIZE);
+ /* Handle a capturing bracket. If there is space in the offset vector, save
+ the current subject position in the working slot at the top of the vector.
+ We mustn't change the current values of the data slot, because they may be
+ set from a previous iteration of this group, and be referred to by a
+ reference inside the group.
+
+ If the bracket fails to match, we need to restore this value and also the
+ values of the final offsets, in case they were set by a previous iteration
+ of the same bracket.
+
+ If there isn't enough space in the offset vector, treat this as if it were
+ a non-capturing bracket. Don't worry about setting the flag for the error
+ case here; that is handled in the code for KET. */
+
+ case OP_CBRA:
+ case OP_SCBRA:
+ number = GET2(ecode, 1+LINK_SIZE);
offset = number << 1;
#ifdef DEBUG
- printf("start bracket %d subject=", number);
+ printf("start bracket %d\n", number);
+ printf("subject=");
pchars(eptr, 16, TRUE, md);
printf("\n");
#endif
DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
+ flags = (op == OP_SCBRA)? match_cbegroup : 0;
do
{
- RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb,
- match_isgroup);
+ RMATCH(rrc, eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
+ ims, eptrb, flags);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
md->capture_last = save_capture_last;
ecode += GET(ecode, 1);
RRETURN(MATCH_NOMATCH);
}
- /* Insufficient room for saving captured contents */
+ /* Insufficient room for saving captured contents. Treat as a non-capturing
+ bracket. */
- else op = OP_BRA;
- }
+ DPRINTF(("insufficient capture room: treat as non-capturing\n"));
- /* Other types of node can be handled by a switch */
-
- switch(op)
- {
- case OP_BRA: /* Non-capturing bracket: optimized */
- DPRINTF(("start bracket 0\n"));
-
- /* Loop for all the alternatives */
+ /* Non-capturing bracket. Loop for all the alternatives. When we get to the
+ final alternative within the brackets, we would return the result of a
+ recursive call to match() whatever happened. We can reduce stack usage by
+ turning this into a tail recursion. */
+ case OP_BRA:
+ case OP_SBRA:
+ DPRINTF(("start non-capturing bracket\n"));
+ flags = (op >= OP_SBRA)? match_cbegroup : 0;
for (;;)
{
- /* When we get to the final alternative within the brackets, we would
- return the result of a recursive call to match() whatever happened. We
- can reduce stack usage by turning this into a tail recursion. */
-
if (ecode[GET(ecode, 1)] != OP_ALT)
- {
- ecode += 1 + LINK_SIZE;
- flags = match_isgroup;
- DPRINTF(("bracket 0 tail recursion\n"));
- goto TAIL_RECURSE;
- }
+ {
+ ecode += _pcre_OP_lengths[*ecode];
+ flags |= match_tail_recursed;
+ DPRINTF(("bracket 0 tail recursion\n"));
+ goto TAIL_RECURSE;
+ }
/* For non-final alternatives, continue the loop for a NOMATCH result;
otherwise return. */
- RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb,
- match_isgroup);
+ RMATCH(rrc, eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
+ eptrb, flags);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
ecode += GET(ecode, 1);
}
obeyed, we can use tail recursion to avoid using another stack frame. */
case OP_COND:
- if (ecode[LINK_SIZE+1] == OP_CREF) /* Condition extract or recurse test */
+ case OP_SCOND:
+ if (ecode[LINK_SIZE+1] == OP_RREF) /* Recursion test */
+ {
+ offset = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
+ condition = md->recursive != NULL &&
+ (offset == RREF_ANY || offset == md->recursive->group_num);
+ ecode += condition? 3 : GET(ecode, 1);
+ }
+
+ else if (ecode[LINK_SIZE+1] == OP_CREF) /* Group used test */
{
offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
- condition = (offset == CREF_RECURSE * 2)?
- (md->recursive != NULL) :
- (offset < offset_top && md->offset_vector[offset] >= 0);
- ecode += condition? (LINK_SIZE + 4) : (LINK_SIZE + 1 + GET(ecode, 1));
- flags = match_isgroup;
- goto TAIL_RECURSE;
+ condition = offset < offset_top && md->offset_vector[offset] >= 0;
+ ecode += condition? 3 : GET(ecode, 1);
+ }
+
+ else if (ecode[LINK_SIZE+1] == OP_DEF) /* DEFINE - always false */
+ {
+ condition = FALSE;
+ ecode += GET(ecode, 1);
}
/* The condition is an assertion. Call match() to evaluate it - setting
- the final argument TRUE causes it to stop at the end of an assertion. */
+ the final argument match_condassert causes it to stop at the end of an
+ assertion. */
else
{
RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
- match_condassert | match_isgroup);
+ match_condassert);
if (rrc == MATCH_MATCH)
{
- ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE+2);
+ condition = TRUE;
+ ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
while (*ecode == OP_ALT) ecode += GET(ecode, 1);
}
else if (rrc != MATCH_NOMATCH)
{
RRETURN(rrc); /* Need braces because of following else */
}
- else ecode += GET(ecode, 1);
+ else
+ {
+ condition = FALSE;
+ ecode += GET(ecode, 1);
+ }
+ }
- /* We are now at the branch that is to be obeyed. As there is only one,
- we can use tail recursion to avoid using another stack frame. */
+ /* We are now at the branch that is to be obeyed. As there is only one,
+ we can use tail recursion to avoid using another stack frame. If the second
+ alternative doesn't exist, we can just plough on. */
+ if (condition || *ecode == OP_ALT)
+ {
ecode += 1 + LINK_SIZE;
- flags = match_isgroup;
+ flags = match_tail_recursed | ((op == OP_SCOND)? match_cbegroup : 0);
goto TAIL_RECURSE;
}
- /* Control never reaches here */
-
- /* Skip over conditional reference or large extraction number data if
- encountered. */
-
- case OP_CREF:
- case OP_BRANUMBER:
- ecode += 3;
+ else
+ {
+ ecode += 1 + LINK_SIZE;
+ }
break;
- /* End of the pattern. If we are in a recursion, we should restore the
- offsets appropriately and continue from after the call. */
+
+ /* End of the pattern. If we are in a top-level recursion, we should
+ restore the offsets appropriately and continue from after the call. */
case OP_END:
if (md->recursive != NULL && md->recursive->group_num == 0)
case OP_ASSERTBACK:
do
{
- RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
- match_isgroup);
+ RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0);
if (rrc == MATCH_MATCH) break;
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
ecode += GET(ecode, 1);
case OP_ASSERTBACK_NOT:
do
{
- RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
- match_isgroup);
+ RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0);
if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
ecode += GET(ecode,1);
#ifdef SUPPORT_UTF8
if (utf8)
{
- c = GET(ecode,1);
- for (i = 0; i < c; i++)
+ i = GET(ecode, 1);
+ while (i-- > 0)
{
eptr--;
if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
/* No UTF-8 support, or not in UTF-8 mode: count is byte count */
{
- eptr -= GET(ecode,1);
+ eptr -= GET(ecode, 1);
if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
}
case OP_RECURSE:
{
callpat = md->start_code + GET(ecode, 1);
- new_recursive.group_num = *callpat - OP_BRA;
-
- /* For extended extraction brackets (large number), we have to fish out
- the number from a dummy opcode at the start. */
-
- if (new_recursive.group_num > EXTRACT_BASIC_MAX)
- new_recursive.group_num = GET2(callpat, 2+LINK_SIZE);
+ new_recursive.group_num = (callpat == md->start_code)? 0 :
+ GET2(callpat, 1 + LINK_SIZE);
/* Add to "recursing stack" */
restore the offset and recursion data. */
DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
+ flags = (*callpat >= OP_SBRA)? match_cbegroup : 0;
do
{
- RMATCH(rrc, eptr, callpat + 1 + LINK_SIZE, offset_top, md, ims,
- eptrb, match_isgroup);
+ RMATCH(rrc, eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
+ md, ims, eptrb, flags);
if (rrc == MATCH_MATCH)
{
DPRINTF(("Recursion matched\n"));
do
{
RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims,
- eptrb, match_isgroup);
+ eptrb, 0);
if (rrc == MATCH_MATCH) break;
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
ecode += GET(ecode,1);
/* Continue as from after the assertion, updating the offsets high water
mark, since extracts may have been taken. */
- do ecode += GET(ecode,1); while (*ecode == OP_ALT);
+ do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
offset_top = md->end_offset_top;
eptr = md->end_match_ptr;
RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
ecode = prev;
- flags = match_isgroup;
+ flags = match_tail_recursed;
goto TAIL_RECURSE;
}
else /* OP_KETRMAX */
{
- RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);
+ RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_cbegroup);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
ecode += 1 + LINK_SIZE;
- flags = 0;
+ flags = match_tail_recursed;
goto TAIL_RECURSE;
}
/* Control never gets here */
case OP_BRAZERO:
{
next = ecode+1;
- RMATCH(rrc, eptr, next, offset_top, md, ims, eptrb, match_isgroup);
+ RMATCH(rrc, eptr, next, offset_top, md, ims, eptrb, 0);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
do next += GET(next,1); while (*next == OP_ALT);
- ecode = next + 1+LINK_SIZE;
+ ecode = next + 1 + LINK_SIZE;
}
break;
case OP_BRAMINZERO:
{
next = ecode+1;
- do next += GET(next,1); while (*next == OP_ALT);
- RMATCH(rrc, eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb,
- match_isgroup);
+ do next += GET(next, 1); while (*next == OP_ALT);
+ RMATCH(rrc, eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
ecode++;
}
break;
- /* End of a group, repeated or non-repeating. If we are at the end of
- an assertion "group", stop matching and return MATCH_MATCH, but record the
- current high water mark for use by positive assertions. Do this also
- for the "once" (not-backup up) groups. */
+ /* End of a group, repeated or non-repeating. */
case OP_KET:
case OP_KETRMIN:
case OP_KETRMAX:
prev = ecode - GET(ecode, 1);
- saved_eptr = eptrb->epb_saved_eptr;
- /* Back up the stack of bracket start pointers. */
+ /* If this was a group that remembered the subject start, in order to break
+ infinite repeats of empty string matches, retrieve the subject start from
+ the chain. Otherwise, set it NULL. */
+
+ if (*prev >= OP_SBRA)
+ {
+ saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
+ eptrb = eptrb->epb_prev; /* Backup to previous group */
+ }
+ else saved_eptr = NULL;
- eptrb = eptrb->epb_prev;
+ /* If we are at the end of an assertion group, stop matching and return
+ MATCH_MATCH, but record the current high water mark for use by positive
+ assertions. Do this also for the "once" (atomic) groups. */
if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
*prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
RRETURN(MATCH_MATCH);
}
- /* In all other cases except a conditional group we have to check the
- group number back at the start and if necessary complete handling an
- extraction by setting the offsets and bumping the high water mark. */
+ /* For capturing groups we have to check the group number back at the start
+ and if necessary complete handling an extraction by setting the offsets and
+ bumping the high water mark. Note that whole-pattern recursion is coded as
+ a recurse into group 0, so it won't be picked up here. Instead, we catch it
+ when the OP_END is reached. Other recursion is handled here. */
- if (*prev != OP_COND)
+ if (*prev == OP_CBRA || *prev == OP_SCBRA)
{
- number = *prev - OP_BRA;
-
- /* For extended extraction brackets (large number), we have to fish out
- the number from a dummy opcode at the start. */
-
- if (number > EXTRACT_BASIC_MAX) number = GET2(prev, 2+LINK_SIZE);
+ number = GET2(prev, 1+LINK_SIZE);
offset = number << 1;
#ifdef DEBUG
printf("\n");
#endif
- /* Test for a numbered group. This includes groups called as a result
- of recursion. Note that whole-pattern recursion is coded as a recurse
- into group 0, so it won't be picked up here. Instead, we catch it when
- the OP_END is reached. */
-
- if (number > 0)
+ md->capture_last = number;
+ if (offset >= md->offset_max) md->offset_overflow = TRUE; else
{
- md->capture_last = number;
- if (offset >= md->offset_max) md->offset_overflow = TRUE; else
- {
- md->offset_vector[offset] =
- md->offset_vector[md->offset_end - number];
- md->offset_vector[offset+1] = eptr - md->start_subject;
- if (offset_top <= offset) offset_top = offset + 2;
- }
+ md->offset_vector[offset] =
+ md->offset_vector[md->offset_end - number];
+ md->offset_vector[offset+1] = eptr - md->start_subject;
+ if (offset_top <= offset) offset_top = offset + 2;
+ }
- /* Handle a recursively called group. Restore the offsets
- appropriately and continue from after the call. */
+ /* Handle a recursively called group. Restore the offsets
+ appropriately and continue from after the call. */
- if (md->recursive != NULL && md->recursive->group_num == number)
- {
- recursion_info *rec = md->recursive;
- DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
- md->recursive = rec->prevrec;
- md->start_match = rec->save_start;
- memcpy(md->offset_vector, rec->offset_save,
- rec->saved_max * sizeof(int));
- ecode = rec->after_call;
- ims = original_ims;
- break;
- }
+ if (md->recursive != NULL && md->recursive->group_num == number)
+ {
+ recursion_info *rec = md->recursive;
+ DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
+ md->recursive = rec->prevrec;
+ md->start_match = rec->save_start;
+ memcpy(md->offset_vector, rec->offset_save,
+ rec->saved_max * sizeof(int));
+ ecode = rec->after_call;
+ ims = original_ims;
+ break;
}
}
- /* Reset the value of the ims flags, in case they got changed during
- the group. */
+ /* For both capturing and non-capturing groups, reset the value of the ims
+ flags, in case they got changed during the group. */
ims = original_ims;
DPRINTF(("ims reset to %02lx\n", ims));
preceding bracket, in the appropriate order. In the second case, we can use
tail recursion to avoid using another stack frame. */
+ flags = (*prev >= OP_SBRA)? match_cbegroup : 0;
+
if (*ecode == OP_KETRMIN)
{
RMATCH(rrc, eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
ecode = prev;
- flags = match_isgroup;
+ flags |= match_tail_recursed;
goto TAIL_RECURSE;
}
else /* OP_KETRMAX */
{
- RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);
+ RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, flags);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
ecode += 1 + LINK_SIZE;
- flags = 0;
+ flags = match_tail_recursed;
goto TAIL_RECURSE;
}
/* Control never gets here */
if ((ims & PCRE_MULTILINE) != 0)
{
if (eptr != md->start_subject &&
- (eptr == md->end_subject ||
- eptr < md->start_subject + md->nllen ||
- !IS_NEWLINE(eptr - md->nllen)))
+ (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
RRETURN(MATCH_NOMATCH);
ecode++;
break;
if (!md->endonly)
{
if (eptr != md->end_subject &&
- (eptr != md->end_subject - md->nllen || !IS_NEWLINE(eptr)))
+ (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
RRETURN(MATCH_NOMATCH);
ecode++;
break;
case OP_EODN:
if (eptr != md->end_subject &&
- (eptr != md->end_subject - md->nllen || !IS_NEWLINE(eptr)))
+ (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
RRETURN(MATCH_NOMATCH);
ecode++;
break;
case OP_ANY:
if ((ims & PCRE_DOTALL) == 0)
{
- if (eptr <= md->end_subject - md->nllen && IS_NEWLINE(eptr))
- RRETURN(MATCH_NOMATCH);
+ if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
}
if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
if (utf8)
ecode++;
break;
+ case OP_ANYNL:
+ if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
+ GETCHARINCTEST(c, eptr);
+ switch(c)
+ {
+ default: RRETURN(MATCH_NOMATCH);
+ case 0x000d:
+ if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
+ break;
+ case 0x000a:
+ case 0x000b:
+ case 0x000c:
+ case 0x0085:
+ case 0x2028:
+ case 0x2029:
+ break;
+ }
+ ecode++;
+ break;
+
#ifdef SUPPORT_UCP
/* Check the next character by Unicode property. We will get here only
if the support is in the binary; otherwise a compile-time error occurs. */
default:
RRETURN(PCRE_ERROR_INTERNAL);
- break;
}
ecode += 3;
else
{
- int dc;
+ unsigned int dc;
GETCHARINC(dc, eptr);
ecode += length;
}
break;
- /* Match a single character repeatedly; different opcodes share code. */
+ /* Match a single character repeatedly. */
case OP_EXACT:
min = max = GET2(ecode, 1);
ecode += 3;
goto REPEATCHAR;
+ case OP_POSUPTO:
+ possessive = TRUE;
+ /* Fall through */
+
case OP_UPTO:
case OP_MINUPTO:
min = 0;
ecode += 3;
goto REPEATCHAR;
+ case OP_POSSTAR:
+ possessive = TRUE;
+ min = 0;
+ max = INT_MAX;
+ ecode++;
+ goto REPEATCHAR;
+
+ case OP_POSPLUS:
+ possessive = TRUE;
+ min = 1;
+ max = INT_MAX;
+ ecode++;
+ goto REPEATCHAR;
+
+ case OP_POSQUERY:
+ possessive = TRUE;
+ min = 0;
+ max = 1;
+ ecode++;
+ goto REPEATCHAR;
+
case OP_STAR:
case OP_MINSTAR:
case OP_PLUS:
uschar occhars[8];
#ifdef SUPPORT_UCP
- int othercase;
+ unsigned int othercase;
if ((ims & PCRE_CASELESS) != 0 &&
- (othercase = _pcre_ucp_othercase(fc)) >= 0 &&
- othercase >= 0)
+ (othercase = _pcre_ucp_othercase(fc)) != NOTACHAR)
oclength = _pcre_ord2utf8(othercase, occhars);
#endif /* SUPPORT_UCP */
}
/* Control never gets here */
}
- else
+
+ else /* Maximize */
{
pp = eptr;
for (i = min; i < max; i++)
eptr += oclength;
}
}
+
+ if (possessive) continue;
while (eptr >= pp)
{
RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
}
/* Control never gets here */
}
- else
+ else /* Maximize */
{
pp = eptr;
for (i = min; i < max; i++)
if (eptr >= md->end_subject || fc != md->lcc[*eptr]) break;
eptr++;
}
+ if (possessive) continue;
while (eptr >= pp)
{
RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
}
/* Control never gets here */
}
- else
+ else /* Maximize */
{
pp = eptr;
for (i = min; i < max; i++)
if (eptr >= md->end_subject || fc != *eptr) break;
eptr++;
}
+ if (possessive) continue;
while (eptr >= pp)
{
RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
ecode += 3;
goto REPEATNOTCHAR;
+ case OP_NOTPOSSTAR:
+ possessive = TRUE;
+ min = 0;
+ max = INT_MAX;
+ ecode++;
+ goto REPEATNOTCHAR;
+
+ case OP_NOTPOSPLUS:
+ possessive = TRUE;
+ min = 1;
+ max = INT_MAX;
+ ecode++;
+ goto REPEATNOTCHAR;
+
+ case OP_NOTPOSQUERY:
+ possessive = TRUE;
+ min = 0;
+ max = 1;
+ ecode++;
+ goto REPEATNOTCHAR;
+
+ case OP_NOTPOSUPTO:
+ possessive = TRUE;
+ min = 0;
+ max = GET2(ecode, 1);
+ ecode += 3;
+ goto REPEATNOTCHAR;
+
case OP_NOTSTAR:
case OP_NOTMINSTAR:
case OP_NOTPLUS:
/* UTF-8 mode */
if (utf8)
{
- register int d;
+ register unsigned int d;
for (i = 1; i <= min; i++)
{
GETCHARINC(d, eptr);
/* UTF-8 mode */
if (utf8)
{
- register int d;
+ register unsigned int d;
for (fi = min;; fi++)
{
RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
/* UTF-8 mode */
if (utf8)
{
- register int d;
+ register unsigned int d;
for (i = min; i < max; i++)
{
int len = 1;
if (fc == d) break;
eptr += len;
}
- for(;;)
+ if (possessive) continue;
+ for(;;)
{
RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
if (eptr >= md->end_subject || fc == md->lcc[*eptr]) break;
eptr++;
}
+ if (possessive) continue;
while (eptr >= pp)
{
RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
/* UTF-8 mode */
if (utf8)
{
- register int d;
+ register unsigned int d;
for (i = 1; i <= min; i++)
{
GETCHARINC(d, eptr);
/* UTF-8 mode */
if (utf8)
{
- register int d;
+ register unsigned int d;
for (fi = min;; fi++)
{
RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
/* UTF-8 mode */
if (utf8)
{
- register int d;
+ register unsigned int d;
for (i = min; i < max; i++)
{
int len = 1;
if (fc == d) break;
eptr += len;
}
+ if (possessive) continue;
for(;;)
{
RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
if (eptr >= md->end_subject || fc == *eptr) break;
eptr++;
}
+ if (possessive) continue;
while (eptr >= pp)
{
RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
ecode += 3;
goto REPEATTYPE;
+ case OP_TYPEPOSSTAR:
+ possessive = TRUE;
+ min = 0;
+ max = INT_MAX;
+ ecode++;
+ goto REPEATTYPE;
+
+ case OP_TYPEPOSPLUS:
+ possessive = TRUE;
+ min = 1;
+ max = INT_MAX;
+ ecode++;
+ goto REPEATTYPE;
+
+ case OP_TYPEPOSQUERY:
+ possessive = TRUE;
+ min = 0;
+ max = 1;
+ ecode++;
+ goto REPEATTYPE;
+
+ case OP_TYPEPOSUPTO:
+ possessive = TRUE;
+ min = 0;
+ max = GET2(ecode, 1);
+ ecode += 3;
+ goto REPEATTYPE;
+
case OP_TYPESTAR:
case OP_TYPEMINSTAR:
case OP_TYPEPLUS:
default:
RRETURN(PCRE_ERROR_INTERNAL);
- break;
}
}
for (i = 1; i <= min; i++)
{
if (eptr >= md->end_subject ||
- ((ims & PCRE_DOTALL) == 0 &&
- eptr <= md->end_subject - md->nllen &&
- IS_NEWLINE(eptr)))
+ ((ims & PCRE_DOTALL) == 0 && IS_NEWLINE(eptr)))
RRETURN(MATCH_NOMATCH);
eptr++;
while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
eptr += min;
break;
+ case OP_ANYNL:
+ for (i = 1; i <= min; i++)
+ {
+ if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
+ GETCHARINC(c, eptr);
+ switch(c)
+ {
+ default: RRETURN(MATCH_NOMATCH);
+ case 0x000d:
+ if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
+ break;
+ case 0x000a:
+ case 0x000b:
+ case 0x000c:
+ case 0x0085:
+ case 0x2028:
+ case 0x2029:
+ break;
+ }
+ }
+ break;
+
case OP_NOT_DIGIT:
for (i = 1; i <= min; i++)
{
#endif /* SUPPORT_UTF8 */
/* Code for the non-UTF-8 case for minimum matching of operators other
- than OP_PROP and OP_NOTPROP. */
+ than OP_PROP and OP_NOTPROP. We can assume that there are the minimum
+ number of bytes present, as this was tested above. */
switch(ctype)
{
{
for (i = 1; i <= min; i++)
{
- if (eptr <= md->end_subject - md->nllen && IS_NEWLINE(eptr))
- RRETURN(MATCH_NOMATCH);
+ if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
eptr++;
}
}
eptr += min;
break;
+ /* Because of the CRLF case, we can't assume the minimum number of
+ bytes are present in this case. */
+
+ case OP_ANYNL:
+ for (i = 1; i <= min; i++)
+ {
+ if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
+ switch(*eptr++)
+ {
+ default: RRETURN(MATCH_NOMATCH);
+ case 0x000d:
+ if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
+ break;
+ case 0x000a:
+ case 0x000b:
+ case 0x000c:
+ case 0x0085:
+ break;
+ }
+ }
+ break;
+
case OP_NOT_DIGIT:
for (i = 1; i <= min; i++)
if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
GETCHARINC(c, eptr);
if (prop_fail_result) RRETURN(MATCH_NOMATCH);
}
- break;
+ /* Control never gets here */
case PT_LAMP:
for (fi = min;; fi++)
prop_chartype == ucp_Lt) == prop_fail_result)
RRETURN(MATCH_NOMATCH);
}
- break;
+ /* Control never gets here */
case PT_GC:
for (fi = min;; fi++)
if ((prop_category == prop_value) == prop_fail_result)
RRETURN(MATCH_NOMATCH);
}
- break;
+ /* Control never gets here */
case PT_PC:
for (fi = min;; fi++)
if ((prop_chartype == prop_value) == prop_fail_result)
RRETURN(MATCH_NOMATCH);
}
- break;
+ /* Control never gets here */
case PT_SC:
for (fi = min;; fi++)
if ((prop_script == prop_value) == prop_fail_result)
RRETURN(MATCH_NOMATCH);
}
- break;
+ /* Control never gets here */
default:
RRETURN(PCRE_ERROR_INTERNAL);
- break;
}
}
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
if (fi >= max || eptr >= md->end_subject ||
(ctype == OP_ANY && (ims & PCRE_DOTALL) == 0 &&
- eptr <= md->end_subject - md->nllen && IS_NEWLINE(eptr)))
+ IS_NEWLINE(eptr)))
RRETURN(MATCH_NOMATCH);
GETCHARINC(c, eptr);
case OP_ANYBYTE:
break;
+ case OP_ANYNL:
+ switch(c)
+ {
+ default: RRETURN(MATCH_NOMATCH);
+ case 0x000d:
+ if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
+ break;
+ case 0x000a:
+ case 0x000b:
+ case 0x000c:
+ case 0x0085:
+ case 0x2028:
+ case 0x2029:
+ break;
+ }
+ break;
+
case OP_NOT_DIGIT:
if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
RRETURN(MATCH_NOMATCH);
RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
if (rrc != MATCH_NOMATCH) RRETURN(rrc);
if (fi >= max || eptr >= md->end_subject ||
- ((ims & PCRE_DOTALL) == 0 &&
- eptr <= md->end_subject - md->nllen && IS_NEWLINE(eptr)))
+ ((ims & PCRE_DOTALL) == 0 && IS_NEWLINE(eptr)))
RRETURN(MATCH_NOMATCH);
c = *eptr++;
case OP_ANYBYTE:
break;
+ case OP_ANYNL:
+ switch(c)
+ {
+ default: RRETURN(MATCH_NOMATCH);
+ case 0x000d:
+ if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
+ break;
+ case 0x000a:
+ case 0x000b:
+ case 0x000c:
+ case 0x0085:
+ break;
+ }
+ break;
+
case OP_NOT_DIGIT:
if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
break;
/* Control never gets here */
}
- /* If maximizing it is worth using inline code for speed, doing the type
+ /* If maximizing, it is worth using inline code for speed, doing the type
test once at the start (i.e. keep it out of the loop). Again, keep the
UTF-8 and UCP stuff separate. */
/* eptr is now past the end of the maximum run */
+ if (possessive) continue;
for(;;)
{
RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
/* eptr is now past the end of the maximum run */
+ if (possessive) continue;
for(;;)
{
RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
{
for (i = min; i < max; i++)
{
- if (eptr >= md->end_subject ||
- (eptr <= md->end_subject - md->nllen && IS_NEWLINE(eptr)))
- break;
+ if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
eptr++;
while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
}
{
for (i = min; i < max; i++)
{
- if (eptr >= md->end_subject ||
- (eptr <= md->end_subject - md->nllen && IS_NEWLINE(eptr)))
- break;
+ if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
eptr++;
}
break;
else
{
c = max - min;
- if (c > md->end_subject - eptr) c = md->end_subject - eptr;
+ if (c > (unsigned int)(md->end_subject - eptr))
+ c = md->end_subject - eptr;
eptr += c;
}
}
case OP_ANYBYTE:
c = max - min;
- if (c > md->end_subject - eptr) c = md->end_subject - eptr;
+ if (c > (unsigned int)(md->end_subject - eptr))
+ c = md->end_subject - eptr;
eptr += c;
break;
+ case OP_ANYNL:
+ for (i = min; i < max; i++)
+ {
+ int len = 1;
+ if (eptr >= md->end_subject) break;
+ GETCHARLEN(c, eptr, len);
+ if (c == 0x000d)
+ {
+ if (++eptr >= md->end_subject) break;
+ if (*eptr == 0x000a) eptr++;
+ }
+ else
+ {
+ if (c != 0x000a && c != 0x000b && c != 0x000c &&
+ c != 0x0085 && c != 0x2028 && c != 0x2029)
+ break;
+ eptr += len;
+ }
+ }
+ break;
+
case OP_NOT_DIGIT:
for (i = min; i < max; i++)
{
/* eptr is now past the end of the maximum run */
+ if (possessive) continue;
for(;;)
{
RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
{
for (i = min; i < max; i++)
{
- if (eptr >= md->end_subject ||
- (eptr <= md->end_subject - md->nllen && IS_NEWLINE(eptr)))
- break;
+ if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
eptr++;
}
break;
case OP_ANYBYTE:
c = max - min;
- if (c > md->end_subject - eptr) c = md->end_subject - eptr;
+ if (c > (unsigned int)(md->end_subject - eptr))
+ c = md->end_subject - eptr;
eptr += c;
break;
+ case OP_ANYNL:
+ for (i = min; i < max; i++)
+ {
+ if (eptr >= md->end_subject) break;
+ c = *eptr;
+ if (c == 0x000d)
+ {
+ if (++eptr >= md->end_subject) break;
+ if (*eptr == 0x000a) eptr++;
+ }
+ else
+ {
+ if (c != 0x000a && c != 0x000b && c != 0x000c && c != 0x0085)
+ break;
+ eptr++;
+ }
+ }
+ break;
+
case OP_NOT_DIGIT:
for (i = min; i < max; i++)
{
/* eptr is now past the end of the maximum run */
+ if (possessive) continue;
while (eptr >= pp)
{
RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
}
/* Control never gets here */
- /* There's been some horrible disaster. Since all codes > OP_BRA are
- for capturing brackets, and there shouldn't be any gaps between 0 and
- OP_BRA, arrival here can only mean there is something seriously wrong
- in the code above or the OP_xxx definitions. */
+ /* There's been some horrible disaster. Arrival here can only mean there is
+ something seriously wrong in the code above or the OP_xxx definitions. */
default:
DPRINTF(("Unknown opcode %d\n", *ecode));
- RRETURN(PCRE_ERROR_UNKNOWN_NODE);
+ RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
}
/* Do not stick any code in here without much thought; it is assumed
#undef cur_is_word
#undef condition
-#undef minimize
#undef prev_is_word
#undef original_ims
BOOL firstline;
BOOL first_byte_caseless = FALSE;
BOOL req_byte_caseless = FALSE;
+BOOL utf8;
match_data match_block;
match_data *md = &match_block;
const uschar *tables;
USPTR start_match = (USPTR)subject + start_offset;
USPTR end_subject;
USPTR req_byte_ptr = start_match - 1;
+eptrblock eptrchain[EPTR_WORK_SIZE];
pcre_study_data internal_study;
const pcre_study_data *study;
end_subject = md->end_subject;
md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
-md->utf8 = (re->options & PCRE_UTF8) != 0;
+utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0;
md->notbol = (options & PCRE_NOTBOL) != 0;
md->noteol = (options & PCRE_NOTEOL) != 0;
md->hitend = FALSE;
md->recursive = NULL; /* No recursion at top level */
+md->eptrchain = eptrchain; /* Make workspace generally available */
md->lcc = tables + lcc_offset;
md->ctypes = tables + ctypes_offset;
/* Handle different types of newline. The two bits give four cases. If nothing
is set at run time, whatever was used at compile time applies. */
-switch ((((options & PCRE_NEWLINE_CRLF) == 0)? re->options : options) &
- PCRE_NEWLINE_CRLF)
+switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : options) &
+ PCRE_NEWLINE_BITS)
{
- default: newline = NEWLINE; break; /* Compile-time default */
+ case 0: newline = NEWLINE; break; /* Compile-time default */
case PCRE_NEWLINE_CR: newline = '\r'; break;
case PCRE_NEWLINE_LF: newline = '\n'; break;
case PCRE_NEWLINE_CR+
PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
+ case PCRE_NEWLINE_ANY: newline = -1; break;
+ default: return PCRE_ERROR_BADNEWLINE;
}
-if (newline > 255)
+if (newline < 0)
{
- md->nllen = 2;
- md->nl[0] = (newline >> 8) & 255;
- md->nl[1] = newline & 255;
+ md->nltype = NLTYPE_ANY;
}
else
{
- md->nllen = 1;
- md->nl[0] = newline;
+ md->nltype = NLTYPE_FIXED;
+ if (newline > 255)
+ {
+ md->nllen = 2;
+ md->nl[0] = (newline >> 8) & 255;
+ md->nl[1] = newline & 255;
+ }
+ else
+ {
+ md->nllen = 1;
+ md->nl[0] = newline;
+ }
}
/* Partial matching is supported only for a restricted set of regexes at the
back the character offset. */
#ifdef SUPPORT_UTF8
-if (md->utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
+if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
{
if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
return PCRE_ERROR_BADUTF8;
req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */
}
+
+/* ==========================================================================*/
+
/* Loop for handling unanchored repeated matching attempts; for anchored regexs
the loop runs just once. */
-do
+for(;;)
{
USPTR save_end_subject = end_subject;
/* Advance to a unique first char if possible. If firstline is TRUE, the
start of the match is constrained to the first line of a multiline string.
- Implement this by temporarily adjusting end_subject so that we stop scanning
- at a newline. If the match fails at the newline, later code breaks this loop.
- */
+ That is, the match must be before or at the first newline. Implement this by
+ temporarily adjusting end_subject so that we stop scanning at a newline. If
+ the match fails at the newline, later code breaks this loop. */
if (firstline)
{
USPTR t = start_match;
- while (t <= save_end_subject - md->nllen && !IS_NEWLINE(t)) t++;
+ while (t < md->end_subject && !IS_NEWLINE(t)) t++;
end_subject = t;
}
else if (startline)
{
- if (start_match >= md->start_subject + md->nllen +
- start_offset)
+ if (start_match > md->start_subject + start_offset)
{
- while (start_match <= end_subject &&
- !IS_NEWLINE(start_match - md->nllen))
+ while (start_match <= end_subject && !WAS_NEWLINE(start_match))
start_match++;
}
}
HOWEVER: when the subject string is very, very long, searching to its end can
take a long time, and give bad performance on quite ordinary patterns. This
- showed up when somebody was matching /^C/ on a 32-megabyte string... so we
- don't do this when the string is sufficiently long.
+ showed up when somebody was matching something like /^\d+C/ on a 32-megabyte
+ string... so we don't do this when the string is sufficiently long.
ALSO: this processing is disabled when partial matching is requested.
*/
}
}
- /* If we can't find the required character, break the matching loop */
+ /* If we can't find the required character, break the matching loop,
+ forcing a match failure. */
- if (p >= end_subject) break;
+ if (p >= end_subject)
+ {
+ rc = MATCH_NOMATCH;
+ break;
+ }
/* If we have found the required character, save the point where we
found it, so that we don't search again next time round the loop if
}
}
- /* When a match occurs, substrings will be set for all internal extractions;
- we just need to set up the whole thing as substring 0 before returning. If
- there were too many extractions, set the return code to zero. In the case
- where we had to get some local store to hold offsets for backreferences, copy
- those back references that we can. In this case there need not be overflow
- if certain parts of the pattern were not used. */
+ /* OK, we can now run the match. */
md->start_match = start_match;
md->match_call_count = 0;
+ md->eptrn = 0; /* Next free eptrchain slot */
+ rc = match(start_match, md->start_code, 2, md, ims, NULL, 0, 0);
- rc = match(start_match, md->start_code, 2, md, ims, NULL, match_isgroup, 0);
+ /* Any return other than MATCH_NOMATCH breaks the loop. */
- /* When the result is no match, if the subject's first character was a
- newline and the PCRE_FIRSTLINE option is set, break (which will return
- PCRE_ERROR_NOMATCH). The option requests that a match occur before the first
- newline in the subject. Otherwise, advance the pointer to the next character
- and continue - but the continuation will actually happen only when the
- pattern is not anchored. */
+ if (rc != MATCH_NOMATCH) break;
- if (rc == MATCH_NOMATCH)
- {
- if (firstline &&
- start_match <= md->end_subject - md->nllen &&
- IS_NEWLINE(start_match))
- break;
- start_match++;
+ /* If PCRE_FIRSTLINE is set, the match must happen before or at the first
+ newline in the subject (though it may continue over the newline). Therefore,
+ if we have just failed to match, starting at a newline, do not continue. */
+
+ if (firstline && IS_NEWLINE(start_match)) break;
+
+ /* Advance the match position by one character. */
+
+ start_match++;
#ifdef SUPPORT_UTF8
- if (md->utf8)
- while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
- start_match++;
+ if (utf8)
+ while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
+ start_match++;
#endif
- continue;
- }
- if (rc != MATCH_MATCH)
- {
- DPRINTF((">>>> error: returning %d\n", rc));
- return rc;
- }
+ /* Break the loop if the pattern is anchored or if we have passed the end of
+ the subject. */
+
+ if (anchored || start_match > end_subject) break;
+
+ /* If we have just passed a CR and the newline option is CRLF or ANY, and we
+ are now at a LF, advance the match position by one more character. */
+
+ if (start_match[-1] == '\r' &&
+ (md->nltype == NLTYPE_ANY || md->nllen == 2) &&
+ start_match < end_subject &&
+ *start_match == '\n')
+ start_match++;
+
+ } /* End of for(;;) "bumpalong" loop */
+
+/* ==========================================================================*/
+
+/* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping
+conditions is true:
- /* We have a match! Copy the offset information from temporary store if
- necessary */
+(1) The pattern is anchored;
+(2) We are past the end of the subject;
+
+(3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because
+ this option requests that a match occur at or before the first newline in
+ the subject.
+
+When we have a match and the offset vector is big enough to deal with any
+backreferences, captured substring offsets will already be set up. In the case
+where we had to get some local store to hold offsets for backreference
+processing, copy those that we can. In this case there need not be overflow if
+certain parts of the pattern were not used, even though there are more
+capturing parentheses than vector slots. */
+
+if (rc == MATCH_MATCH)
+ {
if (using_temporary_offsets)
{
if (offsetcount >= 4)
(offsetcount - 2) * sizeof(int));
DPRINTF(("Copied offsets from temporary memory\n"));
}
- if (md->end_offset_top > offsetcount)
- md->offset_overflow = TRUE;
-
+ if (md->end_offset_top > offsetcount) md->offset_overflow = TRUE;
DPRINTF(("Freeing temporary memory\n"));
(pcre_free)(md->offset_vector);
}
+ /* Set the return code to the number of captured strings, or 0 if there are
+ too many to fit into the vector. */
+
rc = md->offset_overflow? 0 : md->end_offset_top/2;
+ /* If there is space, set up the whole thing as substring 0. */
+
if (offsetcount < 2) rc = 0; else
{
offsets[0] = start_match - md->start_subject;
return rc;
}
-/* This "while" is the end of the "do" above */
-
-while (!anchored && start_match <= end_subject);
+/* Control gets here if there has been an error, or if the overall match
+attempt has failed at all permitted starting positions. */
if (using_temporary_offsets)
{
(pcre_free)(md->offset_vector);
}
-if (md->partial && md->hitend)
+if (rc != MATCH_NOMATCH)
+ {
+ DPRINTF((">>>> error: returning %d\n", rc));
+ return rc;
+ }
+else if (md->partial && md->hitend)
{
DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
return PCRE_ERROR_PARTIAL;