X-Git-Url: https://git.exim.org/exim.git/blobdiff_plain/92e772ff5013bdd9cd67ed05a9cb54369a07f993..c5537c6e21da5c92ab74fc567f663becc59d3f07:/src/src/pcre/pcre_tables.c diff --git a/src/src/pcre/pcre_tables.c b/src/src/pcre/pcre_tables.c index e8120ccc5..530e44038 100644 --- a/src/src/pcre/pcre_tables.c +++ b/src/src/pcre/pcre_tables.c @@ -1,4 +1,4 @@ -/* $Cambridge: exim/src/src/pcre/pcre_tables.c,v 1.2 2005/08/08 10:22:14 ph10 Exp $ */ +/* $Cambridge: exim/src/src/pcre/pcre_tables.c,v 1.6 2007/11/12 13:02:20 nm4 Exp $ */ /************************************************* * Perl-Compatible Regular Expressions * @@ -8,7 +8,7 @@ and semantics are as close as possible to those of the Perl 5 language. Written by Philip Hazel - Copyright (c) 1997-2005 University of Cambridge + Copyright (c) 1997-2007 University of Cambridge ----------------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without @@ -41,14 +41,20 @@ POSSIBILITY OF SUCH DAMAGE. /* This module contains some fixed tables that are used by more than one of the -PCRE code modules. */ +PCRE code modules. The tables are also #included by the pcretest program, which +uses macros to change their names from _pcre_xxx to xxxx, thereby avoiding name +clashes with the library. */ +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + #include "pcre_internal.h" /* Table of sizes for the fixed-length opcodes. It's defined in a macro so that -the definition is next to the definition of the opcodes in internal.h. */ +the definition is next to the definition of the opcodes in pcre_internal.h. */ const uschar _pcre_OP_lengths[] = { OP_LENGTHS }; @@ -61,6 +67,8 @@ const uschar _pcre_OP_lengths[] = { OP_LENGTHS }; /* These are the breakpoints for different numbers of bytes in a UTF-8 character. */ +#ifdef SUPPORT_UTF8 + const int _pcre_utf8_table1[] = { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff}; @@ -72,9 +80,8 @@ first byte of a character, indexed by the number of additional bytes. */ const int _pcre_utf8_table2[] = { 0, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc}; const int _pcre_utf8_table3[] = { 0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01}; -/* Table of the number of extra characters, indexed by the first character -masked with 0x3f. The highest number for a valid UTF-8 character is in fact -0x3d. */ +/* Table of the number of extra bytes, indexed by the first byte masked with +0x3f. The highest number for a valid UTF-8 first byte is in fact 0x3d. */ const uschar _pcre_utf8_table4[] = { 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, @@ -82,50 +89,232 @@ const uschar _pcre_utf8_table4[] = { 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 }; -/* This table translates Unicode property names into code values for the -ucp_findchar() function. It is used by pcretest as well as by the library -functions. */ +/* The pcre_utt[] table below translates Unicode property names into type and +code values. It is searched by binary chop, so must be in collating sequence of +name. Originally, the table contained pointers to the name strings in the first +field of each entry. However, that leads to a large number of relocations when +a shared library is dynamically loaded. A significant reduction is made by +putting all the names into a single, large string and then using offsets in the +table itself. Maintenance is more error-prone, but frequent changes to this +data is unlikely. */ + +const char _pcre_utt_names[] = + "Any\0" + "Arabic\0" + "Armenian\0" + "Balinese\0" + "Bengali\0" + "Bopomofo\0" + "Braille\0" + "Buginese\0" + "Buhid\0" + "C\0" + "Canadian_Aboriginal\0" + "Cc\0" + "Cf\0" + "Cherokee\0" + "Cn\0" + "Co\0" + "Common\0" + "Coptic\0" + "Cs\0" + "Cuneiform\0" + "Cypriot\0" + "Cyrillic\0" + "Deseret\0" + "Devanagari\0" + "Ethiopic\0" + "Georgian\0" + "Glagolitic\0" + "Gothic\0" + "Greek\0" + "Gujarati\0" + "Gurmukhi\0" + "Han\0" + "Hangul\0" + "Hanunoo\0" + "Hebrew\0" + "Hiragana\0" + "Inherited\0" + "Kannada\0" + "Katakana\0" + "Kharoshthi\0" + "Khmer\0" + "L\0" + "L&\0" + "Lao\0" + "Latin\0" + "Limbu\0" + "Linear_B\0" + "Ll\0" + "Lm\0" + "Lo\0" + "Lt\0" + "Lu\0" + "M\0" + "Malayalam\0" + "Mc\0" + "Me\0" + "Mn\0" + "Mongolian\0" + "Myanmar\0" + "N\0" + "Nd\0" + "New_Tai_Lue\0" + "Nko\0" + "Nl\0" + "No\0" + "Ogham\0" + "Old_Italic\0" + "Old_Persian\0" + "Oriya\0" + "Osmanya\0" + "P\0" + "Pc\0" + "Pd\0" + "Pe\0" + "Pf\0" + "Phags_Pa\0" + "Phoenician\0" + "Pi\0" + "Po\0" + "Ps\0" + "Runic\0" + "S\0" + "Sc\0" + "Shavian\0" + "Sinhala\0" + "Sk\0" + "Sm\0" + "So\0" + "Syloti_Nagri\0" + "Syriac\0" + "Tagalog\0" + "Tagbanwa\0" + "Tai_Le\0" + "Tamil\0" + "Telugu\0" + "Thaana\0" + "Thai\0" + "Tibetan\0" + "Tifinagh\0" + "Ugaritic\0" + "Yi\0" + "Z\0" + "Zl\0" + "Zp\0" + "Zs\0"; const ucp_type_table _pcre_utt[] = { - { "C", 128 + ucp_C }, - { "Cc", ucp_Cc }, - { "Cf", ucp_Cf }, - { "Cn", ucp_Cn }, - { "Co", ucp_Co }, - { "Cs", ucp_Cs }, - { "L", 128 + ucp_L }, - { "Ll", ucp_Ll }, - { "Lm", ucp_Lm }, - { "Lo", ucp_Lo }, - { "Lt", ucp_Lt }, - { "Lu", ucp_Lu }, - { "M", 128 + ucp_M }, - { "Mc", ucp_Mc }, - { "Me", ucp_Me }, - { "Mn", ucp_Mn }, - { "N", 128 + ucp_N }, - { "Nd", ucp_Nd }, - { "Nl", ucp_Nl }, - { "No", ucp_No }, - { "P", 128 + ucp_P }, - { "Pc", ucp_Pc }, - { "Pd", ucp_Pd }, - { "Pe", ucp_Pe }, - { "Pf", ucp_Pf }, - { "Pi", ucp_Pi }, - { "Po", ucp_Po }, - { "Ps", ucp_Ps }, - { "S", 128 + ucp_S }, - { "Sc", ucp_Sc }, - { "Sk", ucp_Sk }, - { "Sm", ucp_Sm }, - { "So", ucp_So }, - { "Z", 128 + ucp_Z }, - { "Zl", ucp_Zl }, - { "Zp", ucp_Zp }, - { "Zs", ucp_Zs } + { 0, PT_ANY, 0 }, + { 4, PT_SC, ucp_Arabic }, + { 11, PT_SC, ucp_Armenian }, + { 20, PT_SC, ucp_Balinese }, + { 29, PT_SC, ucp_Bengali }, + { 37, PT_SC, ucp_Bopomofo }, + { 46, PT_SC, ucp_Braille }, + { 54, PT_SC, ucp_Buginese }, + { 63, PT_SC, ucp_Buhid }, + { 69, PT_GC, ucp_C }, + { 71, PT_SC, ucp_Canadian_Aboriginal }, + { 91, PT_PC, ucp_Cc }, + { 94, PT_PC, ucp_Cf }, + { 97, PT_SC, ucp_Cherokee }, + { 106, PT_PC, ucp_Cn }, + { 109, PT_PC, ucp_Co }, + { 112, PT_SC, ucp_Common }, + { 119, PT_SC, ucp_Coptic }, + { 126, PT_PC, ucp_Cs }, + { 129, PT_SC, ucp_Cuneiform }, + { 139, PT_SC, ucp_Cypriot }, + { 147, PT_SC, ucp_Cyrillic }, + { 156, PT_SC, ucp_Deseret }, + { 164, PT_SC, ucp_Devanagari }, + { 175, PT_SC, ucp_Ethiopic }, + { 184, PT_SC, ucp_Georgian }, + { 193, PT_SC, ucp_Glagolitic }, + { 204, PT_SC, ucp_Gothic }, + { 211, PT_SC, ucp_Greek }, + { 217, PT_SC, ucp_Gujarati }, + { 226, PT_SC, ucp_Gurmukhi }, + { 235, PT_SC, ucp_Han }, + { 239, PT_SC, ucp_Hangul }, + { 246, PT_SC, ucp_Hanunoo }, + { 254, PT_SC, ucp_Hebrew }, + { 261, PT_SC, ucp_Hiragana }, + { 270, PT_SC, ucp_Inherited }, + { 280, PT_SC, ucp_Kannada }, + { 288, PT_SC, ucp_Katakana }, + { 297, PT_SC, ucp_Kharoshthi }, + { 308, PT_SC, ucp_Khmer }, + { 314, PT_GC, ucp_L }, + { 316, PT_LAMP, 0 }, + { 319, PT_SC, ucp_Lao }, + { 323, PT_SC, ucp_Latin }, + { 329, PT_SC, ucp_Limbu }, + { 335, PT_SC, ucp_Linear_B }, + { 344, PT_PC, ucp_Ll }, + { 347, PT_PC, ucp_Lm }, + { 350, PT_PC, ucp_Lo }, + { 353, PT_PC, ucp_Lt }, + { 356, PT_PC, ucp_Lu }, + { 359, PT_GC, ucp_M }, + { 361, PT_SC, ucp_Malayalam }, + { 371, PT_PC, ucp_Mc }, + { 374, PT_PC, ucp_Me }, + { 377, PT_PC, ucp_Mn }, + { 380, PT_SC, ucp_Mongolian }, + { 390, PT_SC, ucp_Myanmar }, + { 398, PT_GC, ucp_N }, + { 400, PT_PC, ucp_Nd }, + { 403, PT_SC, ucp_New_Tai_Lue }, + { 415, PT_SC, ucp_Nko }, + { 419, PT_PC, ucp_Nl }, + { 422, PT_PC, ucp_No }, + { 425, PT_SC, ucp_Ogham }, + { 431, PT_SC, ucp_Old_Italic }, + { 442, PT_SC, ucp_Old_Persian }, + { 454, PT_SC, ucp_Oriya }, + { 460, PT_SC, ucp_Osmanya }, + { 468, PT_GC, ucp_P }, + { 470, PT_PC, ucp_Pc }, + { 473, PT_PC, ucp_Pd }, + { 476, PT_PC, ucp_Pe }, + { 479, PT_PC, ucp_Pf }, + { 482, PT_SC, ucp_Phags_Pa }, + { 491, PT_SC, ucp_Phoenician }, + { 502, PT_PC, ucp_Pi }, + { 505, PT_PC, ucp_Po }, + { 508, PT_PC, ucp_Ps }, + { 511, PT_SC, ucp_Runic }, + { 517, PT_GC, ucp_S }, + { 519, PT_PC, ucp_Sc }, + { 522, PT_SC, ucp_Shavian }, + { 530, PT_SC, ucp_Sinhala }, + { 538, PT_PC, ucp_Sk }, + { 541, PT_PC, ucp_Sm }, + { 544, PT_PC, ucp_So }, + { 547, PT_SC, ucp_Syloti_Nagri }, + { 560, PT_SC, ucp_Syriac }, + { 567, PT_SC, ucp_Tagalog }, + { 575, PT_SC, ucp_Tagbanwa }, + { 584, PT_SC, ucp_Tai_Le }, + { 591, PT_SC, ucp_Tamil }, + { 597, PT_SC, ucp_Telugu }, + { 604, PT_SC, ucp_Thaana }, + { 611, PT_SC, ucp_Thai }, + { 616, PT_SC, ucp_Tibetan }, + { 624, PT_SC, ucp_Tifinagh }, + { 633, PT_SC, ucp_Ugaritic }, + { 642, PT_SC, ucp_Yi }, + { 645, PT_GC, ucp_Z }, + { 647, PT_PC, ucp_Zl }, + { 650, PT_PC, ucp_Zp }, + { 653, PT_PC, ucp_Zs } }; const int _pcre_utt_size = sizeof(_pcre_utt)/sizeof(ucp_type_table); +#endif /* SUPPORT_UTF8 */ + /* End of pcre_tables.c */