1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
6 This is a library of functions to support regular expressions whose syntax
7 and semantics are as close as possible to those of the Perl 5 language. See
8 the file Tech.Notes for some information on the internals.
10 Written by: Philip Hazel <ph10@cam.ac.uk>
12 Copyright (c) 1997-2004 University of Cambridge
14 -----------------------------------------------------------------------------
15 Redistribution and use in source and binary forms, with or without
16 modification, are permitted provided that the following conditions are met:
18 * Redistributions of source code must retain the above copyright notice,
19 this list of conditions and the following disclaimer.
21 * Redistributions in binary form must reproduce the above copyright
22 notice, this list of conditions and the following disclaimer in the
23 documentation and/or other materials provided with the distribution.
25 * Neither the name of the University of Cambridge nor the names of its
26 contributors may be used to endorse or promote products derived from
27 this software without specific prior written permission.
29 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
30 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
31 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
32 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
33 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
34 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
35 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
36 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
37 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
38 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
39 POSSIBILITY OF SUCH DAMAGE.
40 -----------------------------------------------------------------------------
44 /* This module contains a debugging function for printing out the internal form
45 of a compiled regular expression. It is kept in a separate file so that it can
46 be #included both in the pcretest program, and in the library itself when
47 compiled with the debugging switch. */
/* Table of opcode name strings, expanded from the OP_NAME_LIST macro (which is
defined outside this file). The printing code below indexes it by opcode byte,
for example OP_names[*code]. */
50 static const char *OP_names[] = { OP_NAME_LIST };
53 /*************************************************
54 * Print single- or multi-byte character *
55 *************************************************/
57 /* These tables are actually copies of ones in pcre.c. If we compile the
58 library with debugging, they are included twice, but that isn't really a
59 problem - compiling with debugging is pretty rare and these are very small. */
/* utf8_t3: mask applied to the first byte of a character to keep its data
bits. print_char indexes it by the number of additional bytes. */
61 static const int utf8_t3[] = { 0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01};
/* utf8_t4: number of additional bytes that follow a lead byte. print_char
indexes it by the low 6 bits of that byte (c & 0x3f). */
63 static const uschar utf8_t4[] = {
64 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
65 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
66 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
67 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 };
/* Print the character that starts at ptr to the stream f.

Arguments:
  f      stream to write to
  ptr    points at the first byte of the character
  utf8   when true, a lead byte with both top bits set starts a multi-byte
         character that is decoded before printing

NOTE(review): the return statements are not visible in this listing. Callers
add the result to their code pointer ("code += 1 + print_char(...)"), so it is
presumably the number of additional bytes consumed - confirm against the full
source. */
70 print_char(FILE *f, uschar *ptr, BOOL utf8)
/* Single-byte case: not in UTF-8 mode, or the top two bits are not both set.
Printable bytes are written literally, anything else as a two-digit hex
escape. */
74 if (!utf8 || (c & 0xc0) != 0xc0)
76 if (isprint(c)) fprintf(f, "%c", c); else fprintf(f, "\\x%02x", c);
82 int a = utf8_t4[c & 0x3f]; /* Number of additional bytes */
/* Keep only the data bits of the lead byte and move them into position; the
declaration of the shift count s is not visible in this listing. */
84 c = (c & utf8_t3[a]) << s;
85 for (i = 1; i <= a; i++)
87 /* This is a check for malformed UTF-8; it should only occur if the sanity
88 check has been turned off. Rather than swallow random bytes, just stop if
89 we hit a bad one. Print it with \X instead of \x as an indication. */
91 if ((ptr[i] & 0xc0) != 0x80)
93 fprintf(f, "\\X{%x}", c);
/* A valid following byte contributes its low 6 bits to the value. */
100 c |= (ptr[i] & 0x3f) << s;
/* Decoded values below 128 use the two-digit form, larger ones the braced
form. */
102 if (c < 128) fprintf(f, "\\x%02x", c); else fprintf(f, "\\x{%x}", c);
110 /*************************************************
111 * Find Unicode property name *
112 *************************************************/
/* Return the name of the Unicode property whose value is "property", found by
searching the utt table from its last entry down to its first. If no entry
matches, the string "??" is returned.

Argument:  property   the property value to look up
Returns:   the matching utt[].name, or "??" */
115 get_ucpname(int property)
/* Start at the last valid index. sizeof(utt)/sizeof(ucp_type_table) is the
number of entries, so using it directly as the first index would read one
element past the end of the table. */
119 for (i = (int)(sizeof(utt)/sizeof(ucp_type_table)) - 1; i >= 0; i--)
121 if (property == utt[i].value) break;
/* i is -1 here when the loop ran off the front without a match. */
123 return (i >= 0)? utt[i].name : "??";
131 /*************************************************
132 * Print compiled regex *
133 *************************************************/
135 /* Make this function work for a regex with integers in either byte order.
136 However, we assume that what we are passed is a compiled regex. */
/* Print the internal form of a compiled regex to the stream f.

Arguments:
  external_re   the compiled regex, accessed here as a real_pcre block
  f             stream that receives the listing

The header fields read from the block are byte-swapped first when its magic
number does not equal MAGIC_NUMBER. This listing omits some lines of the
function (short declarations, braces, some case labels); the visible lines are
kept as they were, apart from the one output call corrected below so that all
output goes to f. */
139 print_internals(pcre *external_re, FILE *f)
141 real_pcre *re = (real_pcre *)external_re;
142 uschar *codestart, *code;
145 unsigned int options = re->options;
146 int offset = re->name_table_offset;
147 int count = re->name_count;
148 int size = re->name_entry_size;
/* A different magic number is treated as the opposite byte order: swap the two
low bytes of the name-table fields and all four bytes of the options word. */
150 if (re->magic_number != MAGIC_NUMBER)
152 offset = ((offset << 8) & 0xff00) | ((offset >> 8) & 0xff);
153 count = ((count << 8) & 0xff00) | ((count >> 8) & 0xff);
154 size = ((size << 8) & 0xff00) | ((size >> 8) & 0xff);
155 options = ((options << 24) & 0xff000000) |
156 ((options << 8) & 0x00ff0000) |
157 ((options >> 8) & 0x0000ff00) |
158 ((options >> 24) & 0x000000ff);
/* The compiled code starts after the name table, which occupies count entries
of size bytes each, beginning offset bytes into the block. */
161 code = codestart = (uschar *)re + offset + count * size;
162 utf8 = (options & PCRE_UTF8) != 0;
/* Each item is prefixed with its byte offset from the start of the code. */
170 fprintf(f, "%3d ", (int)(code - codestart));
174 if (*code - OP_BRA > EXTRACT_BASIC_MAX)
175 fprintf(f, "%3d Bra extra\n", GET(code, 1));
177 fprintf(f, "%3d Bra %d\n", GET(code, 1), *code - OP_BRA);
178 code += OP_lengths[OP_BRA];
185 fprintf(f, " %s\n", OP_names[*code]);
186 fprintf(f, "------------------------------------------------------------------\n");
190 fprintf(f, " %.2x %s", code[1], OP_names[*code]);
199 code += 1 + print_char(f, code, utf8);
201 while (*code == OP_CHAR);
213 code += 1 + print_char(f, code, utf8);
215 while (*code == OP_CHARNC);
228 case OP_ASSERTBACK_NOT:
232 fprintf(f, "%3d %s", GET(code, 1), OP_names[*code]);
/* Fixed: this call used printf(), which sent this one item to stdout instead
of the caller's stream f. */
236 fprintf(f, "%3d %s", GET2(code, 1), OP_names[*code]);
240 if (GET2(code, 1) == CREF_RECURSE)
241 fprintf(f, " Cond recurse");
243 fprintf(f, "%3d %s", GET2(code,1), OP_names[*code]);
257 case OP_TYPEMINQUERY:
259 if (*code >= OP_TYPESTAR)
261 fprintf(f, "%s", OP_names[code[1]]);
262 if (code[1] == OP_PROP || code[1] == OP_NOTPROP)
264 fprintf(f, " %s ", get_ucpname(code[2]));
268 else extra = print_char(f, code+1, utf8);
269 fprintf(f, "%s", OP_names[*code]);
276 extra = print_char(f, code+3, utf8);
278 if (*code != OP_EXACT) fprintf(f, ",");
279 fprintf(f, "%d}", GET2(code,1));
280 if (*code == OP_MINUPTO) fprintf(f, "?");
286 fprintf(f, " %s", OP_names[code[3]]);
287 if (code[3] == OP_PROP || code[3] == OP_NOTPROP)
289 fprintf(f, " %s ", get_ucpname(code[4]));
293 if (*code != OP_TYPEEXACT) fprintf(f, "0,");
294 fprintf(f, "%d}", GET2(code,1));
295 if (*code == OP_TYPEMINUPTO) fprintf(f, "?");
299 if (isprint(c = code[1])) fprintf(f, " [^%c]", c);
300 else fprintf(f, " [^\\x%02x]", c);
309 if (isprint(c = code[1])) fprintf(f, " [^%c]", c);
310 else fprintf(f, " [^\\x%02x]", c);
311 fprintf(f, "%s", OP_names[*code]);
317 if (isprint(c = code[3])) fprintf(f, " [^%c]{", c);
318 else fprintf(f, " [^\\x%02x]{", c);
319 if (*code != OP_NOTEXACT) fprintf(f, ",");
320 fprintf(f, "%d}", GET2(code,1));
321 if (*code == OP_NOTMINUPTO) fprintf(f, "?");
325 fprintf(f, "%3d %s", GET(code, 1), OP_names[*code]);
329 fprintf(f, " \\%d", GET2(code,1));
330 ccode = code + OP_lengths[*code];
331 goto CLASS_REF_REPEAT;
334 fprintf(f, " %s %d %d %d", OP_names[*code], code[1], GET(code,2),
335 GET(code, 2 + LINK_SIZE));
340 fprintf(f, " %s %s", OP_names[*code], get_ucpname(code[1]));
343 /* OP_XCLASS can only occur in UTF-8 mode. However, there's no harm in
344 having this code always here, and it makes it less messy without all those
356 if (*code == OP_XCLASS)
358 extra = GET(code, 1);
359 ccode = code + LINK_SIZE + 1;
360 printmap = (*ccode & XCL_MAP) != 0;
361 if ((*ccode++ & XCL_NOT) != 0) fprintf(f, "^");
369 /* Print a bit map */
373 for (i = 0; i < 256; i++)
375 if ((ccode[i/8] & (1 << (i&7))) != 0)
378 for (j = i+1; j < 256; j++)
379 if ((ccode[j/8] & (1 << (j&7))) == 0) break;
380 if (i == '-' || i == ']') fprintf(f, "\\");
381 if (isprint(i)) fprintf(f, "%c", i); else fprintf(f, "\\x%02x", i);
384 if (j != i + 1) fprintf(f, "-");
385 if (j == '-' || j == ']') fprintf(f, "\\");
386 if (isprint(j)) fprintf(f, "%c", j); else fprintf(f, "\\x%02x", j);
394 /* For an XCLASS there is always some additional data */
396 if (*code == OP_XCLASS)
399 while ((ch = *ccode++) != XCL_END)
403 fprintf(f, "\\p{%s}", get_ucpname(*ccode++));
405 else if (ch == XCL_NOTPROP)
407 fprintf(f, "\\P{%s}", get_ucpname(*ccode++));
411 ccode += 1 + print_char(f, ccode, TRUE);
415 ccode += 1 + print_char(f, ccode, TRUE);
421 /* Indicate a non-UTF8 class which was created by negation */
423 fprintf(f, "]%s", (*code == OP_NCLASS)? " (neg)" : "");
425 /* Handle repeats after a class or a back reference */
436 fprintf(f, "%s", OP_names[*ccode]);
437 extra += OP_lengths[*ccode];
444 if (max == 0) fprintf(f, "{%d,}", min);
445 else fprintf(f, "{%d,%d}", min, max);
446 if (*ccode == OP_CRMINRANGE) fprintf(f, "?");
447 extra += OP_lengths[*ccode];
453 /* Anything else is just an item with no data*/
456 fprintf(f, " %s", OP_names[*code]);
/* Step over this opcode's fixed length plus any extra bytes recorded above. */
460 code += OP_lengths[*code] + extra;
465 /* End of printint.c */