1 /*************************************************
2 * PCRE testing program *
3 *************************************************/
5 /* This program was hacked up as a tester for PCRE. I really should have
6 written it more tidily in the first place. Will I ever learn? It has grown and
7 been extended and consequently is now rather untidy in places.
9 -----------------------------------------------------------------------------
10 Redistribution and use in source and binary forms, with or without
11 modification, are permitted provided that the following conditions are met:
13 * Redistributions of source code must retain the above copyright notice,
14 this list of conditions and the following disclaimer.
16 * Redistributions in binary form must reproduce the above copyright
17 notice, this list of conditions and the following disclaimer in the
18 documentation and/or other materials provided with the distribution.
20 * Neither the name of the University of Cambridge nor the names of its
21 contributors may be used to endorse or promote products derived from
22 this software without specific prior written permission.
24 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
25 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
28 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
29 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
30 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
31 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
32 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
33 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
34 POSSIBILITY OF SUCH DAMAGE.
35 -----------------------------------------------------------------------------
47 /* We need the internal info for displaying the results of pcre_study(). Also
48 for getting the opcodes for showing compiled code. */
50 #define PCRE_SPY /* For Win32 build, import data, not export */
53 /* It is possible to compile this test program without including support for
54 testing the POSIX interface, though this is not available via the standard
58 #include "pcreposix.h"
61 #ifndef CLOCKS_PER_SEC
63 #define CLOCKS_PER_SEC CLK_TCK
65 #define CLOCKS_PER_SEC 100
69 #define LOOPREPEAT 500000
71 #define BUFFER_SIZE 30000
72 #define PBUFFER_SIZE BUFFER_SIZE
73 #define DBUFFER_SIZE BUFFER_SIZE
77 static int log_store = 0;
78 static int callout_count;
79 static int callout_extra;
80 static int callout_fail_count;
81 static int callout_fail_id;
82 static int first_callout;
83 static int show_malloc;
85 static size_t gotten_store;
87 static uschar *pbuffer = NULL;
90 static const int utf8_table1[] = {
91 0x0000007f, 0x000007ff, 0x0000ffff, 0x001fffff, 0x03ffffff, 0x7fffffff};
93 static const int utf8_table2[] = {
94 0, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc};
96 static const int utf8_table3[] = {
97 0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01};
101 /*************************************************
102 * Print compiled regex *
103 *************************************************/
105 /* The code for doing this is held in a separate file that is also included in
106 pcre.c when it is compiled with the debug switch. It defines a function called
107 print_internals(), which uses a table of opcode lengths defined by the macro
108 OP_LENGTHS, whose name must be OP_lengths. It also uses a table that translates
109 Unicode property names to numbers; this is kept in a separate file. */
111 static uschar OP_lengths[] = { OP_LENGTHS };
115 #include "ucptypetable.c"
118 #include "printint.c"
122 /*************************************************
123 * Read number from string *
124 *************************************************/
126 /* We don't use strtoul() because SunOS4 doesn't have it. Rather than mess
127 around with conditional compilation, just do the job by hand. It is only used
128 for unpicking the -o argument, so just keep it simple.
131 str string to be converted
132 endptr where to put the end pointer
134 Returns: the unsigned long
138 get_value(unsigned char *str, unsigned char **endptr)
141 while(*str != 0 && isspace(*str)) str++;
142 while (isdigit(*str)) result = result * 10 + (int)(*str++ - '0');
149 /*************************************************
150 * Convert character value to UTF-8 *
151 *************************************************/
153 /* This function takes an integer value in the range 0 - 0x7fffffff
154 and encodes it as a UTF-8 character in 0 to 6 bytes.
157 cvalue the character value
158 buffer pointer to buffer for result - at least 6 bytes long
160 Returns: number of characters placed in the buffer
161 -1 if input character is negative
162 0 if input character is positive but too big (only when
163 int is longer than 32 bits)
167 ord2utf8(int cvalue, unsigned char *buffer)
170 for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
171 if (cvalue <= utf8_table1[i]) break;
172 if (i >= sizeof(utf8_table1)/sizeof(int)) return 0;
173 if (cvalue < 0) return -1;
176 for (j = i; j > 0; j--)
178 *buffer-- = 0x80 | (cvalue & 0x3f);
181 *buffer = utf8_table2[i] | cvalue;
186 /*************************************************
187 * Convert UTF-8 string to value *
188 *************************************************/
190 /* This function takes one or more bytes that represent a UTF-8 character,
191 and returns the value of the character.
194 buffer a pointer to the byte vector
195 vptr a pointer to an int to receive the value
197 Returns: > 0 => the number of bytes consumed
198 -6 to 0 => malformed UTF-8 character at offset = (-return)
202 utf82ord(unsigned char *buffer, int *vptr)
208 for (i = -1; i < 6; i++) /* i is number of additional bytes */
210 if ((d & 0x80) == 0) break;
214 if (i == -1) { *vptr = c; return 1; } /* ascii character */
215 if (i == 0 || i == 6) return 0; /* invalid UTF-8 */
217 /* i now has a value in the range 1-5 */
220 d = (c & utf8_table3[i]) << s;
222 for (j = 0; j < i; j++)
225 if ((c & 0xc0) != 0x80) return -(j+1);
227 d |= (c & 0x3f) << s;
230 /* Check that encoding was the correct unique one */
232 for (j = 0; j < sizeof(utf8_table1)/sizeof(int); j++)
233 if (d <= utf8_table1[j]) break;
234 if (j != i) return -(i+1);
244 /*************************************************
245 * Print character string *
246 *************************************************/
248 /* Character string printing function. Must handle UTF-8 strings in utf8
249 mode. Yields number of characters printed. If handed a NULL file, just counts
250 chars without printing. */
252 static int pchars(unsigned char *p, int length, FILE *f)
261 int rc = utf82ord(p, &c);
263 if (rc > 0 && rc <= length + 1) /* Mustn't run over the end */
267 if (c < 256 && isprint(c))
269 if (f != NULL) fprintf(f, "%c", c);
275 if (f != NULL) fprintf(f, "\\x{%02x}%n", c, &n);
282 /* Not UTF-8, or malformed UTF-8 */
284 if (isprint(c = *(p++)))
286 if (f != NULL) fprintf(f, "%c", c);
291 if (f != NULL) fprintf(f, "\\x%02x", c);
301 /*************************************************
303 *************************************************/
305 /* Called from PCRE as a result of the (?C) item. We print out where we are in
306 the match. Yield zero unless more callouts than the fail count, or the callout
309 static int callout(pcre_callout_block *cb)
311 FILE *f = (first_callout | callout_extra)? outfile : NULL;
312 int i, pre_start, post_start, subject_length;
316 fprintf(f, "Callout %d: last capture = %d\n",
317 cb->callout_number, cb->capture_last);
319 for (i = 0; i < cb->capture_top * 2; i += 2)
321 if (cb->offset_vector[i] < 0)
322 fprintf(f, "%2d: <unset>\n", i/2);
325 fprintf(f, "%2d: ", i/2);
326 (void)pchars((unsigned char *)cb->subject + cb->offset_vector[i],
327 cb->offset_vector[i+1] - cb->offset_vector[i], f);
333 /* Re-print the subject in canonical form, the first time or if giving full
334 details. On subsequent calls in the same match, we use pchars just to find the
335 printed lengths of the substrings. */
337 if (f != NULL) fprintf(f, "--->");
339 pre_start = pchars((unsigned char *)cb->subject, cb->start_match, f);
340 post_start = pchars((unsigned char *)(cb->subject + cb->start_match),
341 cb->current_position - cb->start_match, f);
343 subject_length = pchars((unsigned char *)cb->subject, cb->subject_length, NULL);
345 (void)pchars((unsigned char *)(cb->subject + cb->current_position),
346 cb->subject_length - cb->current_position, f);
348 if (f != NULL) fprintf(f, "\n");
350 /* Always print appropriate indicators, with callout number if not already
351 shown. For automatic callouts, show the pattern offset. */
353 if (cb->callout_number == 255)
355 fprintf(outfile, "%+3d ", cb->pattern_position);
356 if (cb->pattern_position > 99) fprintf(outfile, "\n ");
360 if (callout_extra) fprintf(outfile, " ");
361 else fprintf(outfile, "%3d ", cb->callout_number);
364 for (i = 0; i < pre_start; i++) fprintf(outfile, " ");
365 fprintf(outfile, "^");
369 for (i = 0; i < post_start - 1; i++) fprintf(outfile, " ");
370 fprintf(outfile, "^");
373 for (i = 0; i < subject_length - pre_start - post_start + 4; i++)
374 fprintf(outfile, " ");
376 fprintf(outfile, "%.*s", (cb->next_item_length == 0)? 1 : cb->next_item_length,
377 pbuffer + cb->pattern_position);
379 fprintf(outfile, "\n");
382 if (cb->callout_data != NULL)
384 int callout_data = *((int *)(cb->callout_data));
385 if (callout_data != 0)
387 fprintf(outfile, "Callout data = %d\n", callout_data);
392 return (cb->callout_number != callout_fail_id)? 0 :
393 (++callout_count >= callout_fail_count)? 1 : 0;
397 /*************************************************
398 * Local malloc functions *
399 *************************************************/
401 /* Alternative malloc function, to test functionality and show the size of the
404 static void *new_malloc(size_t size)
406 void *block = malloc(size);
409 fprintf(outfile, "malloc %3d %p\n", (int)size, block);
413 static void new_free(void *block)
416 fprintf(outfile, "free %p\n", block);
421 /* For recursion malloc/free, to test stacking calls */
423 static void *stack_malloc(size_t size)
425 void *block = malloc(size);
427 fprintf(outfile, "stack_malloc %3d %p\n", (int)size, block);
431 static void stack_free(void *block)
434 fprintf(outfile, "stack_free %p\n", block);
439 /*************************************************
440 * Call pcre_fullinfo() *
441 *************************************************/
443 /* Get one piece of information from the pcre_fullinfo() function */
445 static void new_info(pcre *re, pcre_extra *study, int option, void *ptr)
448 if ((rc = pcre_fullinfo(re, study, option, ptr)) < 0)
449 fprintf(outfile, "Error %d from pcre_fullinfo(%d)\n", rc, option);
454 /*************************************************
455 * Byte flipping function *
456 *************************************************/
459 byteflip(long int value, int n)
461 if (n == 2) return ((value & 0x00ff) << 8) | ((value & 0xff00) >> 8);
462 return ((value & 0x000000ff) << 24) |
463 ((value & 0x0000ff00) << 8) |
464 ((value & 0x00ff0000) >> 8) |
465 ((value & 0xff000000) >> 24);
471 /*************************************************
473 *************************************************/
475 /* Read lines from named file or stdin and write to named file or stdout; lines
476 consist of a regular expression, in delimiters and optionally followed by
477 options, followed by a set of test data, terminated by an empty line. */
479 int main(int argc, char **argv)
481 FILE *infile = stdin;
483 int study_options = 0;
488 int size_offsets = 45;
489 int size_offsets_max;
497 unsigned char *buffer;
498 unsigned char *dbuffer;
500 /* Get buffers from malloc() so that Electric Fence will check their misuse
501 when I am debugging. */
503 buffer = (unsigned char *)malloc(BUFFER_SIZE);
504 dbuffer = (unsigned char *)malloc(DBUFFER_SIZE);
505 pbuffer = (unsigned char *)malloc(PBUFFER_SIZE);
507 /* The outfile variable is static so that new_malloc can use it. The _setmode()
508 stuff is some magic that I don't understand, but which apparently does good
509 things in Windows. It's related to line terminations. */
511 #if defined(_WIN32) || defined(WIN32)
512 _setmode( _fileno( stdout ), 0x8000 );
513 #endif /* defined(_WIN32) || defined(WIN32) */
519 while (argc > 1 && argv[op][0] == '-')
521 unsigned char *endptr;
523 if (strcmp(argv[op], "-s") == 0 || strcmp(argv[op], "-m") == 0)
525 else if (strcmp(argv[op], "-t") == 0) timeit = 1;
526 else if (strcmp(argv[op], "-i") == 0) showinfo = 1;
527 else if (strcmp(argv[op], "-d") == 0) showinfo = debug = 1;
528 else if (strcmp(argv[op], "-o") == 0 && argc > 2 &&
529 ((size_offsets = get_value((unsigned char *)argv[op+1], &endptr)),
536 else if (strcmp(argv[op], "-p") == 0) posix = 1;
538 else if (strcmp(argv[op], "-C") == 0)
541 printf("PCRE version %s\n", pcre_version());
542 printf("Compiled with\n");
543 (void)pcre_config(PCRE_CONFIG_UTF8, &rc);
544 printf(" %sUTF-8 support\n", rc? "" : "No ");
545 (void)pcre_config(PCRE_CONFIG_UNICODE_PROPERTIES, &rc);
546 printf(" %sUnicode properties support\n", rc? "" : "No ");
547 (void)pcre_config(PCRE_CONFIG_NEWLINE, &rc);
548 printf(" Newline character is %s\n", (rc == '\r')? "CR" : "LF");
549 (void)pcre_config(PCRE_CONFIG_LINK_SIZE, &rc);
550 printf(" Internal link size = %d\n", rc);
551 (void)pcre_config(PCRE_CONFIG_POSIX_MALLOC_THRESHOLD, &rc);
552 printf(" POSIX malloc threshold = %d\n", rc);
553 (void)pcre_config(PCRE_CONFIG_MATCH_LIMIT, &rc);
554 printf(" Default match limit = %d\n", rc);
555 (void)pcre_config(PCRE_CONFIG_STACKRECURSE, &rc);
556 printf(" Match recursion uses %s\n", rc? "stack" : "heap");
561 printf("** Unknown or malformed option %s\n", argv[op]);
562 printf("Usage: pcretest [-d] [-i] [-o <n>] [-p] [-s] [-t] [<input> [<output>]]\n");
563 printf(" -C show PCRE compile-time options and exit\n");
564 printf(" -d debug: show compiled code; implies -i\n"
565 " -i show information about compiled pattern\n"
566 " -m output memory used information\n"
567 " -o <n> set size of offsets vector to <n>\n");
569 printf(" -p use POSIX interface\n");
571 printf(" -s output store (memory) used information\n"
572 " -t time compilation and execution\n");
579 /* Get the store for the offsets vector, and remember what it was */
581 size_offsets_max = size_offsets;
582 offsets = (int *)malloc(size_offsets_max * sizeof(int));
585 printf("** Failed to get %d bytes of memory for offsets vector\n",
586 size_offsets_max * sizeof(int));
590 /* Sort out the input and output files */
594 infile = fopen(argv[op], "rb");
597 printf("** Failed to open %s\n", argv[op]);
604 outfile = fopen(argv[op+1], "wb");
607 printf("** Failed to open %s\n", argv[op+1]);
612 /* Set alternative malloc function */
614 pcre_malloc = new_malloc;
615 pcre_free = new_free;
616 pcre_stack_malloc = stack_malloc;
617 pcre_stack_free = stack_free;
619 /* Heading line, then prompt for first regex if stdin */
621 fprintf(outfile, "PCRE version %s\n\n", pcre_version());
628 pcre_extra *extra = NULL;
630 #if !defined NOPOSIX /* There are still compilers that require no indent */
636 unsigned char *p, *pp, *ppp;
637 unsigned char *to_file = NULL;
638 const unsigned char *tables = NULL;
639 unsigned long int true_size, true_study_size = 0;
640 size_t size, regex_gotten_store;
642 int do_debug = debug;
645 int do_showinfo = showinfo;
648 int erroroffset, len, delimiter;
652 if (infile == stdin) printf(" re> ");
653 if (fgets((char *)buffer, BUFFER_SIZE, infile) == NULL) break;
654 if (infile != stdin) fprintf(outfile, "%s", (char *)buffer);
658 while (isspace(*p)) p++;
659 if (*p == 0) continue;
661 /* See if the pattern is to be loaded pre-compiled from a file. */
663 if (*p == '<' && strchr((char *)(p+1), '<') == NULL)
665 unsigned long int magic;
670 pp = p + (int)strlen((char *)p);
671 while (isspace(pp[-1])) pp--;
674 f = fopen((char *)p, "rb");
677 fprintf(outfile, "Failed to open %s: %s\n", p, strerror(errno));
681 if (fread(sbuf, 1, 8, f) != 8) goto FAIL_READ;
684 (sbuf[0] << 24) | (sbuf[1] << 16) | (sbuf[2] << 8) | sbuf[3];
686 (sbuf[4] << 24) | (sbuf[5] << 16) | (sbuf[6] << 8) | sbuf[7];
688 re = (real_pcre *)new_malloc(true_size);
689 regex_gotten_store = gotten_store;
691 if (fread(re, 1, true_size, f) != true_size) goto FAIL_READ;
693 magic = ((real_pcre *)re)->magic_number;
694 if (magic != MAGIC_NUMBER)
696 if (byteflip(magic, sizeof(magic)) == MAGIC_NUMBER)
702 fprintf(outfile, "Data in %s is not a compiled PCRE regex\n", p);
708 fprintf(outfile, "Compiled regex%s loaded from %s\n",
709 do_flip? " (byte-inverted)" : "", p);
711 /* Need to know if UTF-8 for printing data strings */
713 new_info(re, NULL, PCRE_INFO_OPTIONS, &options);
714 use_utf8 = (options & PCRE_UTF8) != 0;
716 /* Now see if there is any following study data */
718 if (true_study_size != 0)
720 pcre_study_data *psd;
722 extra = (pcre_extra *)new_malloc(sizeof(pcre_extra) + true_study_size);
723 extra->flags = PCRE_EXTRA_STUDY_DATA;
725 psd = (pcre_study_data *)(((char *)extra) + sizeof(pcre_extra));
726 extra->study_data = psd;
728 if (fread(psd, 1, true_study_size, f) != true_study_size)
731 fprintf(outfile, "Failed to read data from %s\n", p);
732 if (extra != NULL) new_free(extra);
733 if (re != NULL) new_free(re);
737 fprintf(outfile, "Study data loaded from %s\n", p);
738 do_study = 1; /* To get the data output if requested */
740 else fprintf(outfile, "No study data\n");
746 /* In-line pattern (the usual case). Get the delimiter and seek the end of
747 the pattern; if it isn't complete, read more. */
751 if (isalnum(delimiter) || delimiter == '\\')
753 fprintf(outfile, "** Delimiter must not be alphameric or \\\n");
763 if (*pp == '\\' && pp[1] != 0) pp++;
764 else if (*pp == delimiter) break;
769 len = BUFFER_SIZE - (pp - buffer);
772 fprintf(outfile, "** Expression too long - missing delimiter?\n");
776 if (infile == stdin) printf(" > ");
777 if (fgets((char *)pp, len, infile) == NULL)
779 fprintf(outfile, "** Unexpected EOF\n");
783 if (infile != stdin) fprintf(outfile, "%s", (char *)pp);
786 /* If the first character after the delimiter is backslash, make
787 the pattern end with backslash. This is purely to provide a way
788 of testing for the error message when a pattern ends with backslash. */
790 if (pp[1] == '\\') *pp++ = '\\';
792 /* Terminate the pattern at the delimiter, and save a copy of the pattern
796 strcpy((char *)pbuffer, (char *)p);
798 /* Look for options after final delimiter */
802 log_store = showstore; /* default from command line */
808 case 'g': do_g = 1; break;
809 case 'i': options |= PCRE_CASELESS; break;
810 case 'm': options |= PCRE_MULTILINE; break;
811 case 's': options |= PCRE_DOTALL; break;
812 case 'x': options |= PCRE_EXTENDED; break;
814 case '+': do_showrest = 1; break;
815 case 'A': options |= PCRE_ANCHORED; break;
816 case 'C': options |= PCRE_AUTO_CALLOUT; break;
817 case 'D': do_debug = do_showinfo = 1; break;
818 case 'E': options |= PCRE_DOLLAR_ENDONLY; break;
819 case 'F': do_flip = 1; break;
820 case 'G': do_G = 1; break;
821 case 'I': do_showinfo = 1; break;
822 case 'M': log_store = 1; break;
823 case 'N': options |= PCRE_NO_AUTO_CAPTURE; break;
826 case 'P': do_posix = 1; break;
829 case 'S': do_study = 1; break;
830 case 'U': options |= PCRE_UNGREEDY; break;
831 case 'X': options |= PCRE_EXTRA; break;
832 case '8': options |= PCRE_UTF8; use_utf8 = 1; break;
833 case '?': options |= PCRE_NO_UTF8_CHECK; break;
837 while (*ppp != '\n' && *ppp != ' ') ppp++;
839 if (setlocale(LC_CTYPE, (const char *)pp) == NULL)
841 fprintf(outfile, "** Failed to set locale \"%s\"\n", pp);
844 tables = pcre_maketables();
850 while (*pp != 0) pp++;
851 while (isspace(pp[-1])) pp--;
855 case '\n': case ' ': break;
858 fprintf(outfile, "** Unknown option '%c'\n", pp[-1]);
863 /* Handle compiling via the POSIX interface, which doesn't support the
864 timing, showing, or debugging options, nor the ability to pass over
865 local character tables. */
868 if (posix || do_posix)
873 if ((options & PCRE_CASELESS) != 0) cflags |= REG_ICASE;
874 if ((options & PCRE_MULTILINE) != 0) cflags |= REG_NEWLINE;
875 rc = regcomp(&preg, (char *)p, cflags);
877 /* Compilation failed; go back for another re, skipping to blank line
878 if non-interactive. */
882 (void)regerror(rc, &preg, (char *)buffer, BUFFER_SIZE);
883 fprintf(outfile, "Failed: POSIX code %d: %s\n", rc, buffer);
888 /* Handle compiling via the native interface */
891 #endif /* !defined NOPOSIX */
898 clock_t start_time = clock();
899 for (i = 0; i < LOOPREPEAT; i++)
901 re = pcre_compile((char *)p, options, &error, &erroroffset, tables);
902 if (re != NULL) free(re);
904 time_taken = clock() - start_time;
905 fprintf(outfile, "Compile time %.3f milliseconds\n",
906 (((double)time_taken * 1000.0) / (double)LOOPREPEAT) /
907 (double)CLOCKS_PER_SEC);
910 re = pcre_compile((char *)p, options, &error, &erroroffset, tables);
912 /* Compilation failed; go back for another re, skipping to blank line
913 if non-interactive. */
917 fprintf(outfile, "Failed: %s at offset %d\n", error, erroroffset);
923 if (fgets((char *)buffer, BUFFER_SIZE, infile) == NULL)
928 len = (int)strlen((char *)buffer);
929 while (len > 0 && isspace(buffer[len-1])) len--;
932 fprintf(outfile, "\n");
937 /* Compilation succeeded; print data if required. There are now two
938 info-returning functions. The old one has a limited interface and
939 returns only limited data. Check that it agrees with the newer one. */
942 fprintf(outfile, "Memory allocation (code space): %d\n",
945 ((real_pcre *)re)->name_count * ((real_pcre *)re)->name_entry_size));
947 /* Extract the size for possible writing before possibly flipping it,
948 and remember the store that was got. */
950 true_size = ((real_pcre *)re)->size;
951 regex_gotten_store = gotten_store;
953 /* If /S was present, study the regexp to generate additional info to
954 help with the matching. */
962 clock_t start_time = clock();
963 for (i = 0; i < LOOPREPEAT; i++)
964 extra = pcre_study(re, study_options, &error);
965 time_taken = clock() - start_time;
966 if (extra != NULL) free(extra);
967 fprintf(outfile, " Study time %.3f milliseconds\n",
968 (((double)time_taken * 1000.0) / (double)LOOPREPEAT) /
969 (double)CLOCKS_PER_SEC);
971 extra = pcre_study(re, study_options, &error);
973 fprintf(outfile, "Failed to study: %s\n", error);
974 else if (extra != NULL)
975 true_study_size = ((pcre_study_data *)(extra->study_data))->size;
978 /* If the 'F' option was present, we flip the bytes of all the integer
979 fields in the regex data block and the study block. This is to make it
980 possible to test PCRE's handling of byte-flipped patterns, e.g. those
981 compiled on a different architecture. */
985 real_pcre *rre = (real_pcre *)re;
986 rre->magic_number = byteflip(rre->magic_number, sizeof(rre->magic_number));
987 rre->size = byteflip(rre->size, sizeof(rre->size));
988 rre->options = byteflip(rre->options, sizeof(rre->options));
989 rre->top_bracket = byteflip(rre->top_bracket, sizeof(rre->top_bracket));
990 rre->top_backref = byteflip(rre->top_backref, sizeof(rre->top_backref));
991 rre->first_byte = byteflip(rre->first_byte, sizeof(rre->first_byte));
992 rre->req_byte = byteflip(rre->req_byte, sizeof(rre->req_byte));
993 rre->name_table_offset = byteflip(rre->name_table_offset,
994 sizeof(rre->name_table_offset));
995 rre->name_entry_size = byteflip(rre->name_entry_size,
996 sizeof(rre->name_entry_size));
997 rre->name_count = byteflip(rre->name_count, sizeof(rre->name_count));
1001 pcre_study_data *rsd = (pcre_study_data *)(extra->study_data);
1002 rsd->size = byteflip(rsd->size, sizeof(rsd->size));
1003 rsd->options = byteflip(rsd->options, sizeof(rsd->options));
1007 /* Extract information from the compiled data if required */
1013 unsigned long int get_options, all_options;
1014 int old_first_char, old_options, old_count;
1015 int count, backrefmax, first_char, need_char;
1016 int nameentrysize, namecount;
1017 const uschar *nametable;
1021 fprintf(outfile, "------------------------------------------------------------------\n");
1022 print_internals(re, outfile);
1025 new_info(re, NULL, PCRE_INFO_OPTIONS, &get_options);
1026 new_info(re, NULL, PCRE_INFO_SIZE, &size);
1027 new_info(re, NULL, PCRE_INFO_CAPTURECOUNT, &count);
1028 new_info(re, NULL, PCRE_INFO_BACKREFMAX, &backrefmax);
1029 new_info(re, NULL, PCRE_INFO_FIRSTBYTE, &first_char);
1030 new_info(re, NULL, PCRE_INFO_LASTLITERAL, &need_char);
1031 new_info(re, NULL, PCRE_INFO_NAMEENTRYSIZE, &nameentrysize);
1032 new_info(re, NULL, PCRE_INFO_NAMECOUNT, &namecount);
1033 new_info(re, NULL, PCRE_INFO_NAMETABLE, (void *)&nametable);
1035 old_count = pcre_info(re, &old_options, &old_first_char);
1036 if (count < 0) fprintf(outfile,
1037 "Error %d from pcre_info()\n", count);
1040 if (old_count != count) fprintf(outfile,
1041 "Count disagreement: pcre_fullinfo=%d pcre_info=%d\n", count,
1044 if (old_first_char != first_char) fprintf(outfile,
1045 "First char disagreement: pcre_fullinfo=%d pcre_info=%d\n",
1046 first_char, old_first_char);
1048 if (old_options != (int)get_options) fprintf(outfile,
1049 "Options disagreement: pcre_fullinfo=%ld pcre_info=%d\n",
1050 get_options, old_options);
1053 if (size != regex_gotten_store) fprintf(outfile,
1054 "Size disagreement: pcre_fullinfo=%d call to malloc for %d\n",
1055 (int)size, (int)regex_gotten_store);
1057 fprintf(outfile, "Capturing subpattern count = %d\n", count);
1059 fprintf(outfile, "Max back reference = %d\n", backrefmax);
1063 fprintf(outfile, "Named capturing subpatterns:\n");
1064 while (namecount-- > 0)
1066 fprintf(outfile, " %s %*s%3d\n", nametable + 2,
1067 nameentrysize - 3 - (int)strlen((char *)nametable + 2), "",
1068 GET2(nametable, 0));
1069 nametable += nameentrysize;
1073 /* The NOPARTIAL bit is a private bit in the options, so we have
1074 to fish it out via our back door */
1076 all_options = ((real_pcre *)re)->options;
1079 all_options = byteflip(all_options, sizeof(all_options));
1082 if ((all_options & PCRE_NOPARTIAL) != 0)
1083 fprintf(outfile, "Partial matching not supported\n");
1085 if (get_options == 0) fprintf(outfile, "No options\n");
1086 else fprintf(outfile, "Options:%s%s%s%s%s%s%s%s%s%s\n",
1087 ((get_options & PCRE_ANCHORED) != 0)? " anchored" : "",
1088 ((get_options & PCRE_CASELESS) != 0)? " caseless" : "",
1089 ((get_options & PCRE_EXTENDED) != 0)? " extended" : "",
1090 ((get_options & PCRE_MULTILINE) != 0)? " multiline" : "",
1091 ((get_options & PCRE_DOTALL) != 0)? " dotall" : "",
1092 ((get_options & PCRE_DOLLAR_ENDONLY) != 0)? " dollar_endonly" : "",
1093 ((get_options & PCRE_EXTRA) != 0)? " extra" : "",
1094 ((get_options & PCRE_UNGREEDY) != 0)? " ungreedy" : "",
1095 ((get_options & PCRE_UTF8) != 0)? " utf8" : "",
1096 ((get_options & PCRE_NO_UTF8_CHECK) != 0)? " no_utf8_check" : "");
1098 if (((((real_pcre *)re)->options) & PCRE_ICHANGED) != 0)
1099 fprintf(outfile, "Case state changes\n");
1101 if (first_char == -1)
1103 fprintf(outfile, "First char at start or follows \\n\n");
1105 else if (first_char < 0)
1107 fprintf(outfile, "No first char\n");
1111 int ch = first_char & 255;
1112 const char *caseless = ((first_char & REQ_CASELESS) == 0)?
1115 fprintf(outfile, "First char = \'%c\'%s\n", ch, caseless);
1117 fprintf(outfile, "First char = %d%s\n", ch, caseless);
1122 fprintf(outfile, "No need char\n");
1126 int ch = need_char & 255;
1127 const char *caseless = ((need_char & REQ_CASELESS) == 0)?
1130 fprintf(outfile, "Need char = \'%c\'%s\n", ch, caseless);
1132 fprintf(outfile, "Need char = %d%s\n", ch, caseless);
1135 /* Don't output study size; at present it is in any case a fixed
1136 value, but it varies, depending on the computer architecture, and
1137 so messes up the test suite. (And with the /F option, it might be
1143 fprintf(outfile, "Study returned NULL\n");
1146 uschar *start_bits = NULL;
1147 new_info(re, extra, PCRE_INFO_FIRSTTABLE, &start_bits);
1149 if (start_bits == NULL)
1150 fprintf(outfile, "No starting byte set\n");
1155 fprintf(outfile, "Starting byte set: ");
1156 for (i = 0; i < 256; i++)
1158 if ((start_bits[i/8] & (1<<(i&7))) != 0)
1162 fprintf(outfile, "\n ");
1165 if (isprint(i) && i != ' ')
1167 fprintf(outfile, "%c ", i);
1172 fprintf(outfile, "\\x%02x ", i);
1177 fprintf(outfile, "\n");
1183 /* If the '>' option was present, we write out the regex to a file, and
1184 that is all. The first 8 bytes of the file are the regex length and then
1185 the study length, in big-endian order. */
1187 if (to_file != NULL)
1189 FILE *f = fopen((char *)to_file, "wb");
1192 fprintf(outfile, "Unable to open %s: %s\n", to_file, strerror(errno));
1197 sbuf[0] = (true_size >> 24) & 255;
1198 sbuf[1] = (true_size >> 16) & 255;
1199 sbuf[2] = (true_size >> 8) & 255;
1200 sbuf[3] = (true_size) & 255;
1202 sbuf[4] = (true_study_size >> 24) & 255;
1203 sbuf[5] = (true_study_size >> 16) & 255;
1204 sbuf[6] = (true_study_size >> 8) & 255;
1205 sbuf[7] = (true_study_size) & 255;
1207 if (fwrite(sbuf, 1, 8, f) < 8 ||
1208 fwrite(re, 1, true_size, f) < true_size)
1210 fprintf(outfile, "Write error on %s: %s\n", to_file, strerror(errno));
1214 fprintf(outfile, "Compiled regex written to %s\n", to_file);
1217 if (fwrite(extra->study_data, 1, true_study_size, f) <
1220 fprintf(outfile, "Write error on %s: %s\n", to_file,
1223 else fprintf(outfile, "Study data written to %s\n", to_file);
1228 continue; /* With next regex */
1230 } /* End of non-POSIX compile */
1232 /* Read data lines and test them */
1237 unsigned char *bptr = dbuffer;
1238 int *use_offsets = offsets;
1239 int use_size_offsets = size_offsets;
1240 int callout_data = 0;
1241 int callout_data_set = 0;
1243 int copystrings = 0;
1244 int find_match_limit = 0;
1248 int start_offset = 0;
1253 pcre_callout = callout;
1257 callout_fail_count = 999999;
1258 callout_fail_id = -1;
1261 if (infile == stdin) printf("data> ");
1262 if (fgets((char *)buffer, BUFFER_SIZE, infile) == NULL)
1267 if (infile != stdin) fprintf(outfile, "%s", (char *)buffer);
1269 len = (int)strlen((char *)buffer);
1270 while (len > 0 && isspace(buffer[len-1])) len--;
1272 if (len == 0) break;
1275 while (isspace(*p)) p++;
1278 while ((c = *p++) != 0)
1283 if (c == '\\') switch ((c = *p++))
1285 case 'a': c = 7; break;
1286 case 'b': c = '\b'; break;
1287 case 'e': c = 27; break;
1288 case 'f': c = '\f'; break;
1289 case 'n': c = '\n'; break;
1290 case 'r': c = '\r'; break;
1291 case 't': c = '\t'; break;
1292 case 'v': c = '\v'; break;
1294 case '0': case '1': case '2': case '3':
1295 case '4': case '5': case '6': case '7':
1297 while (i++ < 2 && isdigit(*p) && *p != '8' && *p != '9')
1298 c = c * 8 + *p++ - '0';
1303 /* Handle \x{..} specially - new Perl thing for utf8 */
1307 unsigned char *pt = p;
1309 while (isxdigit(*(++pt)))
1310 c = c * 16 + tolower(*pt) - ((isdigit(*pt))? '0' : 'W');
1313 unsigned char buff8[8];
1315 utn = ord2utf8(c, buff8);
1316 for (ii = 0; ii < utn - 1; ii++) *q++ = buff8[ii];
1317 c = buff8[ii]; /* Last byte */
1321 /* Not correct form; fall through */
1327 while (i++ < 2 && isxdigit(*p))
1329 c = c * 16 + tolower(*p) - ((isdigit(*p))? '0' : 'W');
1334 case 0: /* \ followed by EOF allows for an empty line */
1339 while(isdigit(*p)) start_offset = start_offset * 10 + *p++ - '0';
1342 case 'A': /* Option setting */
1343 options |= PCRE_ANCHORED;
1347 options |= PCRE_NOTBOL;
1351 if (isdigit(*p)) /* Set copy string */
1353 while(isdigit(*p)) n = n * 10 + *p++ - '0';
1354 copystrings |= 1 << n;
1356 else if (isalnum(*p))
1360 while (isalnum(*p)) *npp++ = *p++;
1362 n = pcre_get_stringnumber(re, (char *)name);
1364 fprintf(outfile, "no parentheses with name \"%s\"\n", name);
1365 else copystrings |= 1 << n;
1374 pcre_callout = NULL;
1379 callout_fail_id = 0;
1382 callout_fail_id = callout_fail_id * 10 + *p++ - '0';
1383 callout_fail_count = 0;
1388 callout_fail_count = callout_fail_count * 10 + *p++ - '0';
1395 if (*(++p) == '-') { sign = -1; p++; }
1397 callout_data = callout_data * 10 + *p++ - '0';
1398 callout_data *= sign;
1399 callout_data_set = 1;
1406 while(isdigit(*p)) n = n * 10 + *p++ - '0';
1407 getstrings |= 1 << n;
1409 else if (isalnum(*p))
1413 while (isalnum(*p)) *npp++ = *p++;
1415 n = pcre_get_stringnumber(re, (char *)name);
1417 fprintf(outfile, "no parentheses with name \"%s\"\n", name);
1418 else getstrings |= 1 << n;
1427 find_match_limit = 1;
1431 options |= PCRE_NOTEMPTY;
1435 while(isdigit(*p)) n = n * 10 + *p++ - '0';
1436 if (n > size_offsets_max)
1438 size_offsets_max = n;
1440 use_offsets = offsets = (int *)malloc(size_offsets_max * sizeof(int));
1441 if (offsets == NULL)
1443 printf("** Failed to get %d bytes of memory for offsets vector\n",
1444 size_offsets_max * sizeof(int));
1448 use_size_offsets = n;
1449 if (n == 0) use_offsets = NULL; /* Ensures it can't write to it */
1453 options |= PCRE_PARTIAL;
1461 options |= PCRE_NOTEOL;
1465 options |= PCRE_NO_UTF8_CHECK;
1473 /* Handle matching via the POSIX interface, which does not
1474 support timing or playing with the match limit or callout data. */
1476 #if !defined NOPOSIX
1477 if (posix || do_posix)
1481 regmatch_t *pmatch = NULL;
1482 if (use_size_offsets > 0)
1483 pmatch = (regmatch_t *)malloc(sizeof(regmatch_t) * use_size_offsets);
1484 if ((options & PCRE_NOTBOL) != 0) eflags |= REG_NOTBOL;
1485 if ((options & PCRE_NOTEOL) != 0) eflags |= REG_NOTEOL;
1487 rc = regexec(&preg, (const char *)bptr, use_size_offsets, pmatch, eflags);
1491 (void)regerror(rc, &preg, (char *)buffer, BUFFER_SIZE);
1492 fprintf(outfile, "No match: POSIX code %d: %s\n", rc, buffer);
1497 for (i = 0; i < (size_t)use_size_offsets; i++)
1499 if (pmatch[i].rm_so >= 0)
1501 fprintf(outfile, "%2d: ", (int)i);
1502 (void)pchars(dbuffer + pmatch[i].rm_so,
1503 pmatch[i].rm_eo - pmatch[i].rm_so, outfile);
1504 fprintf(outfile, "\n");
1505 if (i == 0 && do_showrest)
1507 fprintf(outfile, " 0+ ");
1508 (void)pchars(dbuffer + pmatch[i].rm_eo, len - pmatch[i].rm_eo,
1510 fprintf(outfile, "\n");
1518 /* Handle matching via the native interface - repeats for /g and /G */
1521 #endif /* !defined NOPOSIX */
1523 for (;; gmatched++) /* Loop for /g or /G */
1529 clock_t start_time = clock();
1530 for (i = 0; i < LOOPREPEAT; i++)
1531 count = pcre_exec(re, extra, (char *)bptr, len,
1532 start_offset, options | g_notempty, use_offsets, use_size_offsets);
1533 time_taken = clock() - start_time;
1534 fprintf(outfile, "Execute time %.3f milliseconds\n",
1535 (((double)time_taken * 1000.0) / (double)LOOPREPEAT) /
1536 (double)CLOCKS_PER_SEC);
1539 /* If find_match_limit is set, we want to do repeated matches with
1540 varying limits in order to find the minimum value. */
1542 if (find_match_limit)
1550 extra = (pcre_extra *)malloc(sizeof(pcre_extra));
1553 extra->flags |= PCRE_EXTRA_MATCH_LIMIT;
1557 extra->match_limit = mid;
1558 count = pcre_exec(re, extra, (char *)bptr, len, start_offset,
1559 options | g_notempty, use_offsets, use_size_offsets);
1560 if (count == PCRE_ERROR_MATCHLIMIT)
1562 /* fprintf(outfile, "Testing match limit = %d\n", mid); */
1564 mid = (mid == max - 1)? max : (max > 0)? (min + max)/2 : mid*2;
1566 else if (count >= 0 || count == PCRE_ERROR_NOMATCH ||
1567 count == PCRE_ERROR_PARTIAL)
1571 fprintf(outfile, "Minimum match limit = %d\n", mid);
1574 /* fprintf(outfile, "Testing match limit = %d\n", mid); */
1576 mid = (min + mid)/2;
1578 else break; /* Some other error */
1581 extra->flags &= ~PCRE_EXTRA_MATCH_LIMIT;
1584 /* If callout_data is set, use the interface with additional data */
1586 else if (callout_data_set)
1590 extra = (pcre_extra *)malloc(sizeof(pcre_extra));
1593 extra->flags |= PCRE_EXTRA_CALLOUT_DATA;
1594 extra->callout_data = &callout_data;
1595 count = pcre_exec(re, extra, (char *)bptr, len, start_offset,
1596 options | g_notempty, use_offsets, use_size_offsets);
1597 extra->flags &= ~PCRE_EXTRA_CALLOUT_DATA;
1600 /* The normal case is just to do the match once, with the default
1601 value of match_limit. */
1605 count = pcre_exec(re, extra, (char *)bptr, len,
1606 start_offset, options | g_notempty, use_offsets, use_size_offsets);
1611 fprintf(outfile, "Matched, but too many substrings\n");
1612 count = use_size_offsets/3;
1620 for (i = 0; i < count * 2; i += 2)
1622 if (use_offsets[i] < 0)
1623 fprintf(outfile, "%2d: <unset>\n", i/2);
1626 fprintf(outfile, "%2d: ", i/2);
1627 (void)pchars(bptr + use_offsets[i],
1628 use_offsets[i+1] - use_offsets[i], outfile);
1629 fprintf(outfile, "\n");
1634 fprintf(outfile, " 0+ ");
1635 (void)pchars(bptr + use_offsets[i+1], len - use_offsets[i+1],
1637 fprintf(outfile, "\n");
1643 for (i = 0; i < 32; i++)
1645 if ((copystrings & (1 << i)) != 0)
1647 char copybuffer[16];
1648 int rc = pcre_copy_substring((char *)bptr, use_offsets, count,
1649 i, copybuffer, sizeof(copybuffer));
1651 fprintf(outfile, "copy substring %d failed %d\n", i, rc);
1653 fprintf(outfile, "%2dC %s (%d)\n", i, copybuffer, rc);
1657 for (i = 0; i < 32; i++)
1659 if ((getstrings & (1 << i)) != 0)
1661 const char *substring;
1662 int rc = pcre_get_substring((char *)bptr, use_offsets, count,
1665 fprintf(outfile, "get substring %d failed %d\n", i, rc);
1668 fprintf(outfile, "%2dG %s (%d)\n", i, substring, rc);
1669 /* free((void *)substring); */
1670 pcre_free_substring(substring);
1677 const char **stringlist;
1678 int rc = pcre_get_substring_list((char *)bptr, use_offsets, count,
1681 fprintf(outfile, "get substring list failed %d\n", rc);
1684 for (i = 0; i < count; i++)
1685 fprintf(outfile, "%2dL %s\n", i, stringlist[i]);
1686 if (stringlist[i] != NULL)
1687 fprintf(outfile, "string list not terminated by NULL\n");
1688 /* free((void *)stringlist); */
1689 pcre_free_substring_list(stringlist);
1694 /* There was a partial match */
1696 else if (count == PCRE_ERROR_PARTIAL)
1698 fprintf(outfile, "Partial match\n");
1699 break; /* Out of the /g loop */
1702 /* Failed to match. If this is a /g or /G loop and we previously set
1703 g_notempty after a null match, this is not necessarily the end.
1704 We want to advance the start offset, and continue. In the case of UTF-8
1705 matching, the advance must be one character, not one byte. Fudge the
1706 offset values to achieve this. We won't be at the end of the string -
1707 that was checked before setting g_notempty. */
1711 if (g_notempty != 0)
1714 use_offsets[0] = start_offset;
1717 while (start_offset + onechar < len)
1719 int tb = bptr[start_offset+onechar];
1720 if (tb <= 127) break;
1722 if (tb != 0 && tb != 0xc0) onechar++;
1725 use_offsets[1] = start_offset + onechar;
1729 if (count == PCRE_ERROR_NOMATCH)
1731 if (gmatched == 0) fprintf(outfile, "No match\n");
1733 else fprintf(outfile, "Error %d\n", count);
1734 break; /* Out of the /g loop */
1738 /* If not /g or /G we are done */
1740 if (!do_g && !do_G) break;
1742 /* If we have matched an empty string, first check to see if we are at
1743 the end of the subject. If so, the /g loop is over. Otherwise, mimic
1744 what Perl's /g option does. This turns out to be rather cunning. First
1745 we set PCRE_NOTEMPTY and PCRE_ANCHORED and try the match again at the
1746 same point. If this fails (picked up above) we advance to the next
1750 if (use_offsets[0] == use_offsets[1])
1752 if (use_offsets[0] == len) break;
1753 g_notempty = PCRE_NOTEMPTY | PCRE_ANCHORED;
1756 /* For /g, update the start offset, leaving the rest alone */
1758 if (do_g) start_offset = use_offsets[1];
1760 /* For /G, update the pointer and length */
1764 bptr += use_offsets[1];
1765 len -= use_offsets[1];
1767 } /* End of loop for /g and /G */
1768 } /* End of loop for data lines */
1772 #if !defined NOPOSIX
1773 if (posix || do_posix) regfree(&preg);
1776 if (re != NULL) free(re);
1777 if (extra != NULL) free(extra);
1780 free((void *)tables);
1781 setlocale(LC_CTYPE, "C");
1785 if (infile == stdin) fprintf(outfile, "\n");