1 /* $Cambridge: exim/src/src/pcre/pcre_study.c,v 1.6 2007/11/12 13:02:20 nm4 Exp $ */
3 /*************************************************
4 * Perl-Compatible Regular Expressions *
5 *************************************************/
7 /* PCRE is a library of functions to support regular expressions whose syntax
8 and semantics are as close as possible to those of the Perl 5 language.
10 Written by Philip Hazel
11 Copyright (c) 1997-2007 University of Cambridge
13 -----------------------------------------------------------------------------
14 Redistribution and use in source and binary forms, with or without
15 modification, are permitted provided that the following conditions are met:
17 * Redistributions of source code must retain the above copyright notice,
18 this list of conditions and the following disclaimer.
20 * Redistributions in binary form must reproduce the above copyright
21 notice, this list of conditions and the following disclaimer in the
22 documentation and/or other materials provided with the distribution.
24 * Neither the name of the University of Cambridge nor the names of its
25 contributors may be used to endorse or promote products derived from
26 this software without specific prior written permission.
28 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
29 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
30 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
31 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
32 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
33 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
34 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
35 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
36 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
37 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38 POSSIBILITY OF SUCH DAMAGE.
39 -----------------------------------------------------------------------------
43 /* This module contains the external function pcre_study(), along with local
44 supporting functions. */
51 #include "pcre_internal.h"
54 /* Returns from set_start_bits() */
56 enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE };
59 /*************************************************
60 * Set a bit and maybe its alternate case *
61 *************************************************/
63 /* Given a character, set its bit in the table, and also the bit for the other
64 version of a letter if we are caseless.
67 start_bits points to the bit map
69 caseless the caseless flag
70 cd the block with char table pointers
76 set_bit(uschar *start_bits, unsigned int c, BOOL caseless, compile_data *cd)
78 start_bits[c/8] |= (1 << (c&7));
79 if (caseless && (cd->ctypes[c] & ctype_letter) != 0)
80 start_bits[cd->fcc[c]/8] |= (1 << (cd->fcc[c]&7));
85 /*************************************************
86 * Create bitmap of starting bytes *
87 *************************************************/
89 /* This function scans a compiled unanchored expression recursively and
90 attempts to build a bitmap of the set of possible starting bytes. As time goes
91 by, we may be able to get more clever at doing this. The SSB_CONTINUE return is
92 useful for parenthesized groups in patterns such as (a*)b where the group
93 provides some optional starting bytes but scanning must continue at the outer
94 level to find at least one mandatory byte. At the outermost level, this
95 function fails unless the result is SSB_DONE.
98 code points to an expression
99 start_bits points to a 32-byte table, initialized to 0
100 caseless the current state of the caseless flag
101 utf8 TRUE if in UTF-8 mode
102 cd the block with char table pointers
104 Returns: SSB_FAIL => Failed to find any starting bytes
105 SSB_DONE => Found mandatory starting bytes
106 SSB_CONTINUE => Found optional starting bytes
110 set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless,
111 BOOL utf8, compile_data *cd)
114 int yield = SSB_DONE;
117 /* ========================================================================= */
118 /* The following comment and code was inserted in January 1999. In May 2006,
119 when it was observed to cause compiler warnings about unused values, I took it
120 out again. If anybody is still using OS/2, they will have to put it back
123 /* This next statement and the later reference to dummy are here in order to
124 trick the optimizer of the IBM C compiler for OS/2 into generating correct
125 code. Apparently IBM isn't going to fix the problem, and we would rather not
126 disable optimization (in this module it actually makes a big difference, and
127 the pcre module can use all the optimization it can get). */
130 /* ========================================================================= */
135 const uschar *tcode = code + (((int)*code == OP_CBRA)? 3:1) + LINK_SIZE;
136 BOOL try_next = TRUE;
138 while (try_next) /* Loop for items in this branch */
143 /* Fail if we reach something we don't understand */
148 /* If we hit a bracket or a positive lookahead assertion, recurse to set
149 bits from within the subpattern. If it can't find anything, we have to
150 give up. If it finds some mandatory character(s), we are done for this
151 branch. Otherwise, carry on scanning after the subpattern. */
159 rc = set_start_bits(tcode, start_bits, caseless, utf8, cd);
160 if (rc == SSB_FAIL) return SSB_FAIL;
161 if (rc == SSB_DONE) try_next = FALSE; else
163 do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
164 tcode += 1 + LINK_SIZE;
168 /* If we hit ALT or KET, it means we haven't found anything mandatory in
169 this branch, though we might have found something optional. For ALT, we
170 continue with the next alternative, but we have to arrange that the final
171 result from subpattern is SSB_CONTINUE rather than SSB_DONE. For KET,
172 return SSB_CONTINUE: if this is the top level, that indicates failure,
173 but after a nested subpattern, it causes scanning to continue. */
176 yield = SSB_CONTINUE;
185 /* Skip over callout */
188 tcode += 2 + 2*LINK_SIZE;
191 /* Skip over lookbehind and negative lookahead assertions */
195 case OP_ASSERTBACK_NOT:
196 do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
197 tcode += 1 + LINK_SIZE;
200 /* Skip over an option setting, changing the caseless flag */
203 caseless = (tcode[1] & PCRE_CASELESS) != 0;
207 /* BRAZERO does the bracket, but carries on. */
211 if (set_start_bits(++tcode, start_bits, caseless, utf8, cd) == SSB_FAIL)
213 /* =========================================================================
214 See the comment at the head of this function concerning the next line,
215 which was an old fudge for the benefit of OS/2.
217 ========================================================================= */
218 do tcode += GET(tcode,1); while (*tcode == OP_ALT);
219 tcode += 1 + LINK_SIZE;
222 /* Single-char * or ? sets the bit and tries the next item */
230 set_bit(start_bits, tcode[1], caseless, cd);
233 if (utf8 && tcode[-1] >= 0xc0)
234 tcode += _pcre_utf8_table4[tcode[-1] & 0x3f];
238 /* Single-char upto sets the bit and tries the next */
243 set_bit(start_bits, tcode[3], caseless, cd);
246 if (utf8 && tcode[-1] >= 0xc0)
247 tcode += _pcre_utf8_table4[tcode[-1] & 0x3f];
251 /* At least one single char sets the bit and stops */
253 case OP_EXACT: /* Fall through */
261 set_bit(start_bits, tcode[1], caseless, cd);
265 /* Single character type sets the bits and stops */
268 for (c = 0; c < 32; c++)
269 start_bits[c] |= ~cd->cbits[c+cbit_digit];
274 for (c = 0; c < 32; c++)
275 start_bits[c] |= cd->cbits[c+cbit_digit];
279 /* The cbit_space table has vertical tab as whitespace; we have to
282 case OP_NOT_WHITESPACE:
283 for (c = 0; c < 32; c++)
285 int d = cd->cbits[c+cbit_space];
286 if (c == 1) d &= ~0x08;
292 /* The cbit_space table has vertical tab as whitespace; we have to
296 for (c = 0; c < 32; c++)
298 int d = cd->cbits[c+cbit_space];
299 if (c == 1) d &= ~0x08;
305 case OP_NOT_WORDCHAR:
306 for (c = 0; c < 32; c++)
307 start_bits[c] |= ~cd->cbits[c+cbit_word];
312 for (c = 0; c < 32; c++)
313 start_bits[c] |= cd->cbits[c+cbit_word];
317 /* One or more character type fudges the pointer and restarts, knowing
318 it will hit a single character type and stop there. */
329 /* Zero or more repeats of character types set the bits and then
335 tcode += 2; /* Fall through */
341 case OP_TYPEMINQUERY:
342 case OP_TYPEPOSQUERY:
349 for (c = 0; c < 32; c++)
350 start_bits[c] |= ~cd->cbits[c+cbit_digit];
354 for (c = 0; c < 32; c++)
355 start_bits[c] |= cd->cbits[c+cbit_digit];
358 /* The cbit_space table has vertical tab as whitespace; we have to
361 case OP_NOT_WHITESPACE:
362 for (c = 0; c < 32; c++)
364 int d = cd->cbits[c+cbit_space];
365 if (c == 1) d &= ~0x08;
370 /* The cbit_space table has vertical tab as whitespace; we have to
374 for (c = 0; c < 32; c++)
376 int d = cd->cbits[c+cbit_space];
377 if (c == 1) d &= ~0x08;
382 case OP_NOT_WORDCHAR:
383 for (c = 0; c < 32; c++)
384 start_bits[c] |= ~cd->cbits[c+cbit_word];
388 for (c = 0; c < 32; c++)
389 start_bits[c] |= cd->cbits[c+cbit_word];
396 /* Character class where all the information is in a bit map: set the
397 bits and either carry on or not, according to the repeat count. If it was
398 a negative class, and we are operating with UTF-8 characters, any byte
399 with a value >= 0xc4 is a potentially valid starter because it starts a
400 character with a value > 255. */
406 start_bits[24] |= 0xf0; /* Bits for 0xc4 - 0xc8 */
407 memset(start_bits+25, 0xff, 7); /* Bits for 0xc9 - 0xff */
416 /* In UTF-8 mode, the bits in a bit map correspond to character
417 values, not to byte values. However, the bit map we are constructing is
418 for byte values. So we have to do a conversion for characters whose
419 value is > 127. In fact, there are only two possible starting bytes for
420 characters in the range 128 - 255. */
425 for (c = 0; c < 16; c++) start_bits[c] |= tcode[c];
426 for (c = 128; c < 256; c++)
428 if ((tcode[c/8] && (1 << (c&7))) != 0)
430 int d = (c >> 6) | 0xc0; /* Set bit for this starter */
431 start_bits[d/8] |= (1 << (d&7)); /* and then skip on to the */
432 c = (c & 0xc0) + 0x40 - 1; /* next relevant character. */
437 /* In non-UTF-8 mode, the two bit maps are completely compatible. */
442 for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];
445 /* Advance past the bit map, and act on what follows */
459 if (((tcode[1] << 8) + tcode[2]) == 0) tcode += 5;
460 else try_next = FALSE;
468 break; /* End of bitmap class handling */
470 } /* End of switch */
471 } /* End of try_next loop */
473 code += GET(code, 1); /* Advance to next branch */
475 while (*code == OP_ALT);
481 /*************************************************
482 * Study a compiled expression *
483 *************************************************/
485 /* This function is handed a compiled expression that it must study to produce
486 information that will speed up the matching. It returns a pcre_extra block
487 which then gets handed back to pcre_exec().
490 re points to the compiled expression
491 options contains option bits
492 errorptr points to where to place error messages;
493 set NULL unless error
495 Returns: pointer to a pcre_extra block, with study_data filled in and the
496 appropriate flag set;
497 NULL on error or if no optimization possible
500 PCRE_EXP_DEFN pcre_extra *
501 pcre_study(const pcre *external_re, int options, const char **errorptr)
503 uschar start_bits[32];
505 pcre_study_data *study;
506 const uschar *tables;
508 compile_data compile_block;
509 const real_pcre *re = (const real_pcre *)external_re;
513 if (re == NULL || re->magic_number != MAGIC_NUMBER)
515 *errorptr = "argument is not a compiled regular expression";
519 if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
521 *errorptr = "unknown or incorrect option bit(s) set";
525 code = (uschar *)re + re->name_table_offset +
526 (re->name_count * re->name_entry_size);
528 /* For an anchored pattern, or an unanchored pattern that has a first char, or
529 a multiline pattern that matches only at "line starts", no further processing
532 if ((re->options & PCRE_ANCHORED) != 0 ||
533 (re->flags & (PCRE_FIRSTSET|PCRE_STARTLINE)) != 0)
536 /* Set the character tables in the block that is passed around */
540 (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
543 compile_block.lcc = tables + lcc_offset;
544 compile_block.fcc = tables + fcc_offset;
545 compile_block.cbits = tables + cbits_offset;
546 compile_block.ctypes = tables + ctypes_offset;
548 /* See if we can find a fixed set of initial characters for the pattern. */
550 memset(start_bits, 0, 32 * sizeof(uschar));
551 if (set_start_bits(code, start_bits, (re->options & PCRE_CASELESS) != 0,
552 (re->options & PCRE_UTF8) != 0, &compile_block) != SSB_DONE) return NULL;
554 /* Get a pcre_extra block and a pcre_study_data block. The study data is put in
555 the latter, which is pointed to by the former, which may also get additional
556 data set later by the calling program. At the moment, the size of
557 pcre_study_data is fixed. We nevertheless save it in a field for returning via
558 the pcre_fullinfo() function so that if it becomes variable in the future, we
559 don't have to change that code. */
561 extra = (pcre_extra *)(pcre_malloc)
562 (sizeof(pcre_extra) + sizeof(pcre_study_data));
566 *errorptr = "failed to get memory";
570 study = (pcre_study_data *)((char *)extra + sizeof(pcre_extra));
571 extra->flags = PCRE_EXTRA_STUDY_DATA;
572 extra->study_data = study;
574 study->size = sizeof(pcre_study_data);
575 study->options = PCRE_STUDY_MAPPED;
576 memcpy(study->start_bits, start_bits, sizeof(start_bits));
581 /* End of pcre_study.c */