X-Git-Url: https://git.exim.org/exim.git/blobdiff_plain/1cc59d374be28e0f2a27720d003271216676c12e..6bf342e1eb87c656de34d668d7303a7d0138e89a:/src/src/pcre/pcre_study.c diff --git a/src/src/pcre/pcre_study.c b/src/src/pcre/pcre_study.c index 62990940a..ae312f272 100644 --- a/src/src/pcre/pcre_study.c +++ b/src/src/pcre/pcre_study.c @@ -1,4 +1,4 @@ -/* $Cambridge: exim/src/src/pcre/pcre_study.c,v 1.3 2006/11/07 16:50:36 ph10 Exp $ */ +/* $Cambridge: exim/src/src/pcre/pcre_study.c,v 1.4 2007/01/23 15:08:45 ph10 Exp $ */ /************************************************* * Perl-Compatible Regular Expressions * @@ -47,6 +47,11 @@ supporting functions. */ #include "pcre_internal.h" +/* Returns from set_start_bits() */ + +enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE }; + + /************************************************* * Set a bit and maybe its alternate case * *************************************************/ @@ -74,12 +79,16 @@ if (caseless && (cd->ctypes[c] & ctype_letter) != 0) /************************************************* -* Create bitmap of starting chars * +* Create bitmap of starting bytes * *************************************************/ -/* This function scans a compiled unanchored expression and attempts to build a -bitmap of the set of initial characters. If it can't, it returns FALSE. As time -goes by, we may be able to get more clever at doing this. +/* This function scans a compiled unanchored expression recursively and +attempts to build a bitmap of the set of possible starting bytes. As time goes +by, we may be able to get more clever at doing this. The SSB_CONTINUE return is +useful for parenthesized groups in patterns such as (a*)b where the group +provides some optional starting bytes but scanning must continue at the outer +level to find at least one mandatory byte. At the outermost level, this +function fails unless the result is SSB_DONE. Arguments: code points to an expression @@ -88,14 +97,17 @@ Arguments: utf8 TRUE if in UTF-8 mode cd the block with char table pointers -Returns: TRUE if table built, FALSE otherwise +Returns: SSB_FAIL => Failed to find any starting bytes + SSB_DONE => Found mandatory starting bytes + SSB_CONTINUE => Found optional starting bytes */ -static BOOL +static int set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless, BOOL utf8, compile_data *cd) { register int c; +int yield = SSB_DONE; #if 0 /* ========================================================================= */ @@ -116,36 +128,60 @@ volatile int dummy; do { - const uschar *tcode = code + 1 + LINK_SIZE; + const uschar *tcode = code + (((int)*code == OP_CBRA)? 3:1) + LINK_SIZE; BOOL try_next = TRUE; - while (try_next) + while (try_next) /* Loop for items in this branch */ { - /* If a branch starts with a bracket or a positive lookahead assertion, - recurse to set bits from within them. That's all for this branch. */ - - if ((int)*tcode >= OP_BRA || *tcode == OP_ASSERT) + int rc; + switch(*tcode) { - if (!set_start_bits(tcode, start_bits, caseless, utf8, cd)) - return FALSE; - try_next = FALSE; - } + /* Fail if we reach something we don't understand */ - else switch(*tcode) - { default: - return FALSE; + return SSB_FAIL; + + /* If we hit a bracket or a positive lookahead assertion, recurse to set + bits from within the subpattern. If it can't find anything, we have to + give up. If it finds some mandatory character(s), we are done for this + branch. Otherwise, carry on scanning after the subpattern. */ + + case OP_BRA: + case OP_SBRA: + case OP_CBRA: + case OP_SCBRA: + case OP_ONCE: + case OP_ASSERT: + rc = set_start_bits(tcode, start_bits, caseless, utf8, cd); + if (rc == SSB_FAIL) return SSB_FAIL; + if (rc == SSB_DONE) try_next = FALSE; else + { + do tcode += GET(tcode, 1); while (*tcode == OP_ALT); + tcode += 1 + LINK_SIZE; + } + break; - /* Skip over callout */ + /* If we hit ALT or KET, it means we haven't found anything mandatory in + this branch, though we might have found something optional. For ALT, we + continue with the next alternative, but we have to arrange that the final + result from subpattern is SSB_CONTINUE rather than SSB_DONE. For KET, + return SSB_CONTINUE: if this is the top level, that indicates failure, + but after a nested subpattern, it causes scanning to continue. */ - case OP_CALLOUT: - tcode += 2 + 2*LINK_SIZE; + case OP_ALT: + yield = SSB_CONTINUE; + try_next = FALSE; break; - /* Skip over extended extraction bracket number */ + case OP_KET: + case OP_KETRMAX: + case OP_KETRMIN: + return SSB_CONTINUE; - case OP_BRANUMBER: - tcode += 3; + /* Skip over callout */ + + case OP_CALLOUT: + tcode += 2 + 2*LINK_SIZE; break; /* Skip over lookbehind and negative lookahead assertions */ @@ -154,7 +190,7 @@ do case OP_ASSERTBACK: case OP_ASSERTBACK_NOT: do tcode += GET(tcode, 1); while (*tcode == OP_ALT); - tcode += 1+LINK_SIZE; + tcode += 1 + LINK_SIZE; break; /* Skip over an option setting, changing the caseless flag */ @@ -168,27 +204,30 @@ do case OP_BRAZERO: case OP_BRAMINZERO: - if (!set_start_bits(++tcode, start_bits, caseless, utf8, cd)) - return FALSE; + if (set_start_bits(++tcode, start_bits, caseless, utf8, cd) == SSB_FAIL) + return SSB_FAIL; /* ========================================================================= See the comment at the head of this function concerning the next line, which was an old fudge for the benefit of OS/2. dummy = 1; ========================================================================= */ do tcode += GET(tcode,1); while (*tcode == OP_ALT); - tcode += 1+LINK_SIZE; + tcode += 1 + LINK_SIZE; break; /* Single-char * or ? sets the bit and tries the next item */ case OP_STAR: case OP_MINSTAR: + case OP_POSSTAR: case OP_QUERY: case OP_MINQUERY: + case OP_POSQUERY: set_bit(start_bits, tcode[1], caseless, cd); tcode += 2; #ifdef SUPPORT_UTF8 - if (utf8) while ((*tcode & 0xc0) == 0x80) tcode++; + if (utf8 && tcode[-1] >= 0xc0) + tcode += _pcre_utf8_table4[tcode[-1] & 0x3f]; #endif break; @@ -196,10 +235,12 @@ do case OP_UPTO: case OP_MINUPTO: + case OP_POSUPTO: set_bit(start_bits, tcode[3], caseless, cd); tcode += 4; #ifdef SUPPORT_UTF8 - if (utf8) while ((*tcode & 0xc0) == 0x80) tcode++; + if (utf8 && tcode[-1] >= 0xc0) + tcode += _pcre_utf8_table4[tcode[-1] & 0x3f]; #endif break; @@ -212,6 +253,7 @@ do case OP_CHARNC: case OP_PLUS: case OP_MINPLUS: + case OP_POSPLUS: set_bit(start_bits, tcode[1], caseless, cd); try_next = FALSE; break; @@ -285,16 +327,19 @@ do case OP_TYPEUPTO: case OP_TYPEMINUPTO: + case OP_TYPEPOSUPTO: tcode += 2; /* Fall through */ case OP_TYPESTAR: case OP_TYPEMINSTAR: + case OP_TYPEPOSSTAR: case OP_TYPEQUERY: case OP_TYPEMINQUERY: + case OP_TYPEPOSQUERY: switch(tcode[1]) { case OP_ANY: - return FALSE; + return SSB_FAIL; case OP_NOT_DIGIT: for (c = 0; c < 32; c++) @@ -420,7 +465,7 @@ do code += GET(code, 1); /* Advance to next branch */ } while (*code == OP_ALT); -return TRUE; +return yield; } @@ -494,8 +539,8 @@ compile_block.ctypes = tables + ctypes_offset; /* See if we can find a fixed set of initial characters for the pattern. */ memset(start_bits, 0, 32 * sizeof(uschar)); -if (!set_start_bits(code, start_bits, (re->options & PCRE_CASELESS) != 0, - (re->options & PCRE_UTF8) != 0, &compile_block)) return NULL; +if (set_start_bits(code, start_bits, (re->options & PCRE_CASELESS) != 0, + (re->options & PCRE_UTF8) != 0, &compile_block) != SSB_DONE) return NULL; /* Get a pcre_extra block and a pcre_study_data block. The study data is put in the latter, which is pointed to by the former, which may also get additional