+/* $Cambridge: exim/src/src/pcre/pcretest.c,v 1.2 2005/06/15 08:57:10 ph10 Exp $ */
+
/*************************************************
* PCRE testing program *
*************************************************/
/* This program was hacked up as a tester for PCRE. I really should have
written it more tidily in the first place. Will I ever learn? It has grown and
-been extended and consequently is now rather untidy in places.
+been extended and consequently is now rather, er, *very* untidy in places.
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
#include <locale.h>
#include <errno.h>
-/* We need the internal info for displaying the results of pcre_study(). Also
-for getting the opcodes for showing compiled code. */
-
#define PCRE_SPY /* For Win32 build, import data, not export */
-#include "internal.h"
+
+/* We need the internal info for displaying the results of pcre_study() and
+other internal data; pcretest also uses some of the fixed tables, and generally
+has "inside information" compared to a program that strictly follows the PCRE
+API. */
+
+#include "pcre_internal.h"
+
/* It is possible to compile this test program without including support for
testing the POSIX interface, though this is not available via the standard
#include "pcreposix.h"
#endif
+/* It is also possible, for the benefit of the version imported into Exim, to
+build pcretest without support for UTF8 (define NOUTF8), without the interface
+to the DFA matcher (NODFA), and without the doublecheck of the old "info"
+function (define NOINFOCHECK). */
+
+
#ifndef CLOCKS_PER_SEC
#ifdef CLK_TCK
#define CLOCKS_PER_SEC CLK_TCK
static uschar *pbuffer = NULL;
-static const int utf8_table1[] = {
- 0x0000007f, 0x000007ff, 0x0000ffff, 0x001fffff, 0x03ffffff, 0x7fffffff};
-
-static const int utf8_table2[] = {
- 0, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc};
-
-static const int utf8_table3[] = {
- 0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01};
-
-
-
-/*************************************************
-* Print compiled regex *
-*************************************************/
-
-/* The code for doing this is held in a separate file that is also included in
-pcre.c when it is compiled with the debug switch. It defines a function called
-print_internals(), which uses a table of opcode lengths defined by the macro
-OP_LENGTHS, whose name must be OP_lengths. It also uses a table that translates
-Unicode property names to numbers; this is kept in a separate file. */
-
-static uschar OP_lengths[] = { OP_LENGTHS };
-
-#ifdef SUPPORT_UCP
-#include "ucp.h"
-#include "ucptypetable.c"
-#endif
-
-#include "printint.c"
-
-
/*************************************************
* Read number from string *
-/*************************************************
-* Convert character value to UTF-8 *
-*************************************************/
-
-/* This function takes an integer value in the range 0 - 0x7fffffff
-and encodes it as a UTF-8 character in 0 to 6 bytes.
-
-Arguments:
- cvalue the character value
- buffer pointer to buffer for result - at least 6 bytes long
-
-Returns: number of characters placed in the buffer
- -1 if input character is negative
- 0 if input character is positive but too big (only when
- int is longer than 32 bits)
-*/
-
-static int
-ord2utf8(int cvalue, unsigned char *buffer)
-{
-register int i, j;
-for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
- if (cvalue <= utf8_table1[i]) break;
-if (i >= sizeof(utf8_table1)/sizeof(int)) return 0;
-if (cvalue < 0) return -1;
-
-buffer += i;
-for (j = i; j > 0; j--)
- {
- *buffer-- = 0x80 | (cvalue & 0x3f);
- cvalue >>= 6;
- }
-*buffer = utf8_table2[i] | cvalue;
-return i + 1;
-}
-
/*************************************************
* Convert UTF-8 string to value *
-6 to 0 => malformed UTF-8 character at offset = (-return)
*/
+#if !defined NOUTF8
+
static int
utf82ord(unsigned char *buffer, int *vptr)
{
/* i now has a value in the range 1-5 */
s = 6*i;
-d = (c & utf8_table3[i]) << s;
+d = (c & _pcre_utf8_table3[i]) << s;
for (j = 0; j < i; j++)
{
/* Check that encoding was the correct unique one */
-for (j = 0; j < sizeof(utf8_table1)/sizeof(int); j++)
- if (d <= utf8_table1[j]) break;
+for (j = 0; j < _pcre_utf8_table1_size; j++)
+ if (d <= _pcre_utf8_table1[j]) break;
if (j != i) return -(i+1);
/* Valid value */
return i+1;
}
+#endif
+
/*************************************************
while (length-- > 0)
{
+#if !defined NOUTF8
if (use_utf8)
{
int rc = utf82ord(p, &c);
continue;
}
}
+#endif
/* Not UTF-8, or malformed UTF-8 */
int showstore = 0;
int size_offsets = 45;
int size_offsets_max;
-int *offsets;
+int *offsets = NULL;
#if !defined NOPOSIX
int posix = 0;
#endif
int debug = 0;
int done = 0;
+int all_use_dfa = 0;
+int yield = 0;
unsigned char *buffer;
unsigned char *dbuffer;
else if (strcmp(argv[op], "-t") == 0) timeit = 1;
else if (strcmp(argv[op], "-i") == 0) showinfo = 1;
else if (strcmp(argv[op], "-d") == 0) showinfo = debug = 1;
+#if !defined NODFA
+ else if (strcmp(argv[op], "-dfa") == 0) all_use_dfa = 1;
+#endif
else if (strcmp(argv[op], "-o") == 0 && argc > 2 &&
((size_offsets = get_value((unsigned char *)argv[op+1], &endptr)),
*endptr == 0))
printf("** Unknown or malformed option %s\n", argv[op]);
printf("Usage: pcretest [-d] [-i] [-o <n>] [-p] [-s] [-t] [<input> [<output>]]\n");
printf(" -C show PCRE compile-time options and exit\n");
- printf(" -d debug: show compiled code; implies -i\n"
- " -i show information about compiled pattern\n"
+ printf(" -d debug: show compiled code; implies -i\n");
+#if !defined NODFA
+ printf(" -dfa force DFA matching for all subjects\n");
+#endif
+ printf(" -i show information about compiled pattern\n"
" -m output memory used information\n"
" -o <n> set size of offsets vector to <n>\n");
#if !defined NOPOSIX
#endif
printf(" -s output store (memory) used information\n"
" -t time compilation and execution\n");
- return 1;
+ yield = 1;
+ goto EXIT;
}
op++;
argc--;
{
printf("** Failed to get %d bytes of memory for offsets vector\n",
size_offsets_max * sizeof(int));
- return 1;
+ yield = 1;
+ goto EXIT;
}
/* Sort out the input and output files */
if (infile == NULL)
{
printf("** Failed to open %s\n", argv[op]);
- return 1;
+ yield = 1;
+ goto EXIT;
}
}
if (outfile == NULL)
{
printf("** Failed to open %s\n", argv[op+1]);
- return 1;
+ yield = 1;
+ goto EXIT;
}
}
{
switch (*pp++)
{
+ case 'f': options |= PCRE_FIRSTLINE; break;
case 'g': do_g = 1; break;
case 'i': options |= PCRE_CASELESS; break;
case 'm': options |= PCRE_MULTILINE; break;
case 'L':
ppp = pp;
- while (*ppp != '\n' && *ppp != ' ') ppp++;
+ /* The '\r' test here is so that it works on Windows */
+ while (*ppp != '\n' && *ppp != '\r' && *ppp != ' ') ppp++;
*ppp = 0;
if (setlocale(LC_CTYPE, (const char *)pp) == NULL)
{
*pp = 0;
break;
- case '\n': case ' ': break;
+ case '\r': /* So that it works in Windows */
+ case '\n':
+ case ' ':
+ break;
default:
fprintf(outfile, "** Unknown option '%c'\n", pp[-1]);
if ((options & PCRE_CASELESS) != 0) cflags |= REG_ICASE;
if ((options & PCRE_MULTILINE) != 0) cflags |= REG_NEWLINE;
+ if ((options & PCRE_DOTALL) != 0) cflags |= REG_DOTALL;
rc = regcomp(&preg, (char *)p, cflags);
/* Compilation failed; go back for another re, skipping to blank line
if (do_showinfo)
{
unsigned long int get_options, all_options;
+#if !defined NOINFOCHECK
int old_first_char, old_options, old_count;
+#endif
int count, backrefmax, first_char, need_char;
int nameentrysize, namecount;
const uschar *nametable;
if (do_debug)
{
fprintf(outfile, "------------------------------------------------------------------\n");
- print_internals(re, outfile);
+ _pcre_printint(re, outfile);
}
new_info(re, NULL, PCRE_INFO_OPTIONS, &get_options);
new_info(re, NULL, PCRE_INFO_NAMECOUNT, &namecount);
new_info(re, NULL, PCRE_INFO_NAMETABLE, (void *)&nametable);
+#if !defined NOINFOCHECK
old_count = pcre_info(re, &old_options, &old_first_char);
if (count < 0) fprintf(outfile,
"Error %d from pcre_info()\n", count);
"Options disagreement: pcre_fullinfo=%ld pcre_info=%d\n",
get_options, old_options);
}
+#endif
if (size != regex_gotten_store) fprintf(outfile,
"Size disagreement: pcre_fullinfo=%d call to malloc for %d\n",
fprintf(outfile, "Partial matching not supported\n");
if (get_options == 0) fprintf(outfile, "No options\n");
- else fprintf(outfile, "Options:%s%s%s%s%s%s%s%s%s%s\n",
+ else fprintf(outfile, "Options:%s%s%s%s%s%s%s%s%s%s%s\n",
((get_options & PCRE_ANCHORED) != 0)? " anchored" : "",
((get_options & PCRE_CASELESS) != 0)? " caseless" : "",
((get_options & PCRE_EXTENDED) != 0)? " extended" : "",
((get_options & PCRE_MULTILINE) != 0)? " multiline" : "",
+ ((get_options & PCRE_FIRSTLINE) != 0)? " firstline" : "",
((get_options & PCRE_DOTALL) != 0)? " dotall" : "",
((get_options & PCRE_DOLLAR_ENDONLY) != 0)? " dollar_endonly" : "",
((get_options & PCRE_EXTRA) != 0)? " extra" : "",
}
fclose(f);
}
+
+ new_free(re);
+ if (extra != NULL) new_free(extra);
+ if (tables != NULL) new_free((void *)tables);
continue; /* With next regex */
}
} /* End of non-POSIX compile */
int gmatched = 0;
int start_offset = 0;
int g_notempty = 0;
+ int use_dfa = 0;
options = 0;
/* Handle \x{..} specially - new Perl thing for utf8 */
+#if !defined NOUTF8
if (*p == '{')
{
unsigned char *pt = p;
{
unsigned char buff8[8];
int ii, utn;
- utn = ord2utf8(c, buff8);
+ utn = _pcre_ord2utf8(c, buff8);
for (ii = 0; ii < utn - 1; ii++) *q++ = buff8[ii];
c = buff8[ii]; /* Last byte */
p = pt + 1;
}
/* Not correct form; fall through */
}
+#endif
/* Ordinary \x */
}
continue;
+#if !defined NODFA
+ case 'D':
+#if !defined NOPOSIX
+ if (posix || do_posix)
+ printf("** Can't use dfa matching in POSIX mode: \\D ignored\n");
+ else
+#endif
+ use_dfa = 1;
+ continue;
+
+ case 'F':
+ options |= PCRE_DFA_SHORTEST;
+ continue;
+#endif
+
case 'G':
if (isdigit(*p))
{
{
printf("** Failed to get %d bytes of memory for offsets vector\n",
size_offsets_max * sizeof(int));
- return 1;
+ yield = 1;
+ goto EXIT;
}
}
use_size_offsets = n;
options |= PCRE_PARTIAL;
continue;
+#if !defined NODFA
+ case 'R':
+ options |= PCRE_DFA_RESTART;
+ continue;
+#endif
+
case 'S':
show_malloc = 1;
continue;
*q = 0;
len = q - dbuffer;
+ if ((all_use_dfa || use_dfa) && find_match_limit)
+ {
+ printf("**Match limit not relevant for DFA matching: ignored\n");
+ find_match_limit = 0;
+ }
+
/* Handle matching via the POSIX interface, which does not
support timing or playing with the match limit or callout data. */
register int i;
clock_t time_taken;
clock_t start_time = clock();
+
+#if !defined NODFA
+ if (all_use_dfa || use_dfa)
+ {
+ int workspace[1000];
+ for (i = 0; i < LOOPREPEAT; i++)
+ count = pcre_dfa_exec(re, NULL, (char *)bptr, len, start_offset,
+ options | g_notempty, use_offsets, use_size_offsets, workspace,
+ sizeof(workspace)/sizeof(int));
+ }
+ else
+#endif
+
for (i = 0; i < LOOPREPEAT; i++)
count = pcre_exec(re, extra, (char *)bptr, len,
start_offset, options | g_notempty, use_offsets, use_size_offsets);
+
time_taken = clock() - start_time;
fprintf(outfile, "Execute time %.3f milliseconds\n",
(((double)time_taken * 1000.0) / (double)LOOPREPEAT) /
/* The normal case is just to do the match once, with the default
value of match_limit. */
- else
+#if !defined NODFA
+ else if (all_use_dfa || use_dfa)
{
- count = pcre_exec(re, extra, (char *)bptr, len,
- start_offset, options | g_notempty, use_offsets, use_size_offsets);
+ int workspace[1000];
+ count = pcre_dfa_exec(re, NULL, (char *)bptr, len, start_offset,
+ options | g_notempty, use_offsets, use_size_offsets, workspace,
+ sizeof(workspace)/sizeof(int));
+ if (count == 0)
+ {
+ fprintf(outfile, "Matched, but too many subsidiary matches\n");
+ count = use_size_offsets/2;
+ }
}
+#endif
- if (count == 0)
+ else
{
- fprintf(outfile, "Matched, but too many substrings\n");
- count = use_size_offsets/3;
+ count = pcre_exec(re, extra, (char *)bptr, len,
+ start_offset, options | g_notempty, use_offsets, use_size_offsets);
+ if (count == 0)
+ {
+ fprintf(outfile, "Matched, but too many substrings\n");
+ count = use_size_offsets/3;
+ }
}
/* Matched */
else if (count == PCRE_ERROR_PARTIAL)
{
- fprintf(outfile, "Partial match\n");
+ fprintf(outfile, "Partial match");
+#if !defined NODFA
+ if ((all_use_dfa || use_dfa) && use_size_offsets > 2)
+ fprintf(outfile, ": %.*s", use_offsets[1] - use_offsets[0],
+ bptr + use_offsets[0]);
+#endif
+ fprintf(outfile, "\n");
break; /* Out of the /g loop */
}
if (posix || do_posix) regfree(&preg);
#endif
- if (re != NULL) free(re);
- if (extra != NULL) free(extra);
+ if (re != NULL) new_free(re);
+ if (extra != NULL) new_free(extra);
if (tables != NULL)
{
- free((void *)tables);
+ new_free((void *)tables);
setlocale(LC_CTYPE, "C");
}
}
if (infile == stdin) fprintf(outfile, "\n");
-return 0;
+
+EXIT:
+
+if (infile != NULL && infile != stdin) fclose(infile);
+if (outfile != NULL && outfile != stdout) fclose(outfile);
+
+free(buffer);
+free(dbuffer);
+free(pbuffer);
+free(offsets);
+
+return yield;
}
-/* End */
+/* End of pcretest.c */