-/* $Cambridge: exim/src/src/pcre/pcre_study.c,v 1.3 2006/11/07 16:50:36 ph10 Exp $ */
+/* $Cambridge: exim/src/src/pcre/pcre_study.c,v 1.4 2007/01/23 15:08:45 ph10 Exp $ */
/*************************************************
* Perl-Compatible Regular Expressions *
#include "pcre_internal.h"
+/* Returns from set_start_bits() */
+
+enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE };
+
+
/*************************************************
* Set a bit and maybe its alternate case *
*************************************************/
/*************************************************
-* Create bitmap of starting chars *
+* Create bitmap of starting bytes *
*************************************************/
-/* This function scans a compiled unanchored expression and attempts to build a
-bitmap of the set of initial characters. If it can't, it returns FALSE. As time
-goes by, we may be able to get more clever at doing this.
+/* This function scans a compiled unanchored expression recursively and
+attempts to build a bitmap of the set of possible starting bytes. As time goes
+by, we may be able to get more clever at doing this. The SSB_CONTINUE return is
+useful for parenthesized groups in patterns such as (a*)b where the group
+provides some optional starting bytes but scanning must continue at the outer
+level to find at least one mandatory byte. At the outermost level, this
+function fails unless the result is SSB_DONE.
Arguments:
code points to an expression
utf8 TRUE if in UTF-8 mode
cd the block with char table pointers
-Returns: TRUE if table built, FALSE otherwise
+Returns: SSB_FAIL => Failed to find any starting bytes
+ SSB_DONE => Found mandatory starting bytes
+ SSB_CONTINUE => Found optional starting bytes
*/
-static BOOL
+static int
set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless,
BOOL utf8, compile_data *cd)
{
register int c;
+int yield = SSB_DONE;
#if 0
/* ========================================================================= */
do
{
- const uschar *tcode = code + 1 + LINK_SIZE;
+ const uschar *tcode = code + (((int)*code == OP_CBRA)? 3:1) + LINK_SIZE;
BOOL try_next = TRUE;
- while (try_next)
+ while (try_next) /* Loop for items in this branch */
{
- /* If a branch starts with a bracket or a positive lookahead assertion,
- recurse to set bits from within them. That's all for this branch. */
-
- if ((int)*tcode >= OP_BRA || *tcode == OP_ASSERT)
+ int rc;
+ switch(*tcode)
{
- if (!set_start_bits(tcode, start_bits, caseless, utf8, cd))
- return FALSE;
- try_next = FALSE;
- }
+ /* Fail if we reach something we don't understand */
- else switch(*tcode)
- {
default:
- return FALSE;
+ return SSB_FAIL;
+
+ /* If we hit a bracket or a positive lookahead assertion, recurse to set
+ bits from within the subpattern. If it can't find anything, we have to
+ give up. If it finds some mandatory character(s), we are done for this
+ branch. Otherwise, carry on scanning after the subpattern. */
+
+ case OP_BRA:
+ case OP_SBRA:
+ case OP_CBRA:
+ case OP_SCBRA:
+ case OP_ONCE:
+ case OP_ASSERT:
+ rc = set_start_bits(tcode, start_bits, caseless, utf8, cd);
+ if (rc == SSB_FAIL) return SSB_FAIL;
+ if (rc == SSB_DONE) try_next = FALSE; else
+ {
+ do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
+ tcode += 1 + LINK_SIZE;
+ }
+ break;
- /* Skip over callout */
+ /* If we hit ALT or KET, it means we haven't found anything mandatory in
+ this branch, though we might have found something optional. For ALT, we
+ continue with the next alternative, but we have to arrange that the final
+ result from subpattern is SSB_CONTINUE rather than SSB_DONE. For KET,
+ return SSB_CONTINUE: if this is the top level, that indicates failure,
+ but after a nested subpattern, it causes scanning to continue. */
- case OP_CALLOUT:
- tcode += 2 + 2*LINK_SIZE;
+ case OP_ALT:
+ yield = SSB_CONTINUE;
+ try_next = FALSE;
break;
- /* Skip over extended extraction bracket number */
+ case OP_KET:
+ case OP_KETRMAX:
+ case OP_KETRMIN:
+ return SSB_CONTINUE;
- case OP_BRANUMBER:
- tcode += 3;
+ /* Skip over callout */
+
+ case OP_CALLOUT:
+ tcode += 2 + 2*LINK_SIZE;
break;
/* Skip over lookbehind and negative lookahead assertions */
case OP_ASSERTBACK:
case OP_ASSERTBACK_NOT:
do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
- tcode += 1+LINK_SIZE;
+ tcode += 1 + LINK_SIZE;
break;
/* Skip over an option setting, changing the caseless flag */
case OP_BRAZERO:
case OP_BRAMINZERO:
- if (!set_start_bits(++tcode, start_bits, caseless, utf8, cd))
- return FALSE;
+ if (set_start_bits(++tcode, start_bits, caseless, utf8, cd) == SSB_FAIL)
+ return SSB_FAIL;
/* =========================================================================
See the comment at the head of this function concerning the next line,
which was an old fudge for the benefit of OS/2.
dummy = 1;
========================================================================= */
do tcode += GET(tcode,1); while (*tcode == OP_ALT);
- tcode += 1+LINK_SIZE;
+ tcode += 1 + LINK_SIZE;
break;
/* Single-char * or ? sets the bit and tries the next item */
case OP_STAR:
case OP_MINSTAR:
+ case OP_POSSTAR:
case OP_QUERY:
case OP_MINQUERY:
+ case OP_POSQUERY:
set_bit(start_bits, tcode[1], caseless, cd);
tcode += 2;
#ifdef SUPPORT_UTF8
- if (utf8) while ((*tcode & 0xc0) == 0x80) tcode++;
+ if (utf8 && tcode[-1] >= 0xc0)
+ tcode += _pcre_utf8_table4[tcode[-1] & 0x3f];
#endif
break;
case OP_UPTO:
case OP_MINUPTO:
+ case OP_POSUPTO:
set_bit(start_bits, tcode[3], caseless, cd);
tcode += 4;
#ifdef SUPPORT_UTF8
- if (utf8) while ((*tcode & 0xc0) == 0x80) tcode++;
+ if (utf8 && tcode[-1] >= 0xc0)
+ tcode += _pcre_utf8_table4[tcode[-1] & 0x3f];
#endif
break;
case OP_CHARNC:
case OP_PLUS:
case OP_MINPLUS:
+ case OP_POSPLUS:
set_bit(start_bits, tcode[1], caseless, cd);
try_next = FALSE;
break;
case OP_TYPEUPTO:
case OP_TYPEMINUPTO:
+ case OP_TYPEPOSUPTO:
tcode += 2; /* Fall through */
case OP_TYPESTAR:
case OP_TYPEMINSTAR:
+ case OP_TYPEPOSSTAR:
case OP_TYPEQUERY:
case OP_TYPEMINQUERY:
+ case OP_TYPEPOSQUERY:
switch(tcode[1])
{
case OP_ANY:
- return FALSE;
+ return SSB_FAIL;
case OP_NOT_DIGIT:
for (c = 0; c < 32; c++)
code += GET(code, 1); /* Advance to next branch */
}
while (*code == OP_ALT);
-return TRUE;
+return yield;
}
/* See if we can find a fixed set of initial characters for the pattern. */
memset(start_bits, 0, 32 * sizeof(uschar));
-if (!set_start_bits(code, start_bits, (re->options & PCRE_CASELESS) != 0,
- (re->options & PCRE_UTF8) != 0, &compile_block)) return NULL;
+if (set_start_bits(code, start_bits, (re->options & PCRE_CASELESS) != 0,
+ (re->options & PCRE_UTF8) != 0, &compile_block) != SSB_DONE) return NULL;
/* Get a pcre_extra block and a pcre_study_data block. The study data is put in
the latter, which is pointed to by the former, which may also get additional