X-Git-Url: https://git.exim.org/exim.git/blobdiff_plain/92e772ff5013bdd9cd67ed05a9cb54369a07f993..c5537c6e21da5c92ab74fc567f663becc59d3f07:/src/src/pcre/pcre_tables.c

diff --git a/src/src/pcre/pcre_tables.c b/src/src/pcre/pcre_tables.c
index e8120ccc5..530e44038 100644
--- a/src/src/pcre/pcre_tables.c
+++ b/src/src/pcre/pcre_tables.c
@@ -1,4 +1,4 @@
-/* $Cambridge: exim/src/src/pcre/pcre_tables.c,v 1.2 2005/08/08 10:22:14 ph10 Exp $ */
+/* $Cambridge: exim/src/src/pcre/pcre_tables.c,v 1.6 2007/11/12 13:02:20 nm4 Exp $ */
 
 /*************************************************
 *      Perl-Compatible Regular Expressions       *
@@ -8,7 +8,7 @@
 and semantics are as close as possible to those of the Perl 5 language.
 
                        Written by Philip Hazel
-           Copyright (c) 1997-2005 University of Cambridge
+           Copyright (c) 1997-2007 University of Cambridge
 
 -----------------------------------------------------------------------------
 Redistribution and use in source and binary forms, with or without
@@ -41,14 +41,20 @@ POSSIBILITY OF SUCH DAMAGE.
 
 
 /* This module contains some fixed tables that are used by more than one of the
-PCRE code modules. */
+PCRE code modules. The tables are also #included by the pcretest program, which
+uses macros to change their names from _pcre_xxx to xxxx, thereby avoiding name
+clashes with the library. */
 
 
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
 #include "pcre_internal.h"
 
 
 /* Table of sizes for the fixed-length opcodes. It's defined in a macro so that
-the definition is next to the definition of the opcodes in internal.h. */
+the definition is next to the definition of the opcodes in pcre_internal.h. */
 
 const uschar _pcre_OP_lengths[] = { OP_LENGTHS };
 
@@ -61,6 +67,8 @@ const uschar _pcre_OP_lengths[] = { OP_LENGTHS };
 /* These are the breakpoints for different numbers of bytes in a UTF-8
 character. */
 
+#ifdef SUPPORT_UTF8
+
 const int _pcre_utf8_table1[] =
   { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff};
 
@@ -72,9 +80,8 @@ first byte of a character, indexed by the number of additional bytes. */
 const int _pcre_utf8_table2[] = { 0,    0xc0, 0xe0, 0xf0, 0xf8, 0xfc};
 const int _pcre_utf8_table3[] = { 0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01};
 
-/* Table of the number of extra characters, indexed by the first character
-masked with 0x3f. The highest number for a valid UTF-8 character is in fact
-0x3d. */
+/* Table of the number of extra bytes, indexed by the first byte masked with
+0x3f. The highest number for a valid UTF-8 first byte is in fact 0x3d. */
 
 const uschar _pcre_utf8_table4[] = {
   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
@@ -82,50 +89,232 @@ const uschar _pcre_utf8_table4[] = {
   2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
   3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 };
 
-/* This table translates Unicode property names into code values for the
-ucp_findchar() function. It is used by pcretest as well as by the library
-functions. */
+/* The pcre_utt[] table below translates Unicode property names into type and
+code values. It is searched by binary chop, so must be in collating sequence of
+name. Originally, the table contained pointers to the name strings in the first
+field of each entry. However, that leads to a large number of relocations when
+a shared library is dynamically loaded. A significant reduction is made by
+putting all the names into a single, large string and then using offsets in the
+table itself. Maintenance is more error-prone, but frequent changes to this
+data is unlikely. */
+
+const char _pcre_utt_names[] =
+  "Any\0"
+  "Arabic\0"
+  "Armenian\0"
+  "Balinese\0"
+  "Bengali\0"
+  "Bopomofo\0"
+  "Braille\0"
+  "Buginese\0"
+  "Buhid\0"
+  "C\0"
+  "Canadian_Aboriginal\0"
+  "Cc\0"
+  "Cf\0"
+  "Cherokee\0"
+  "Cn\0"
+  "Co\0"
+  "Common\0"
+  "Coptic\0"
+  "Cs\0"
+  "Cuneiform\0"
+  "Cypriot\0"
+  "Cyrillic\0"
+  "Deseret\0"
+  "Devanagari\0"
+  "Ethiopic\0"
+  "Georgian\0"
+  "Glagolitic\0"
+  "Gothic\0"
+  "Greek\0"
+  "Gujarati\0"
+  "Gurmukhi\0"
+  "Han\0"
+  "Hangul\0"
+  "Hanunoo\0"
+  "Hebrew\0"
+  "Hiragana\0"
+  "Inherited\0"
+  "Kannada\0"
+  "Katakana\0"
+  "Kharoshthi\0"
+  "Khmer\0"
+  "L\0"
+  "L&\0"
+  "Lao\0"
+  "Latin\0"
+  "Limbu\0"
+  "Linear_B\0"
+  "Ll\0"
+  "Lm\0"
+  "Lo\0"
+  "Lt\0"
+  "Lu\0"
+  "M\0"
+  "Malayalam\0"
+  "Mc\0"
+  "Me\0"
+  "Mn\0"
+  "Mongolian\0"
+  "Myanmar\0"
+  "N\0"
+  "Nd\0"
+  "New_Tai_Lue\0"
+  "Nko\0"
+  "Nl\0"
+  "No\0"
+  "Ogham\0"
+  "Old_Italic\0"
+  "Old_Persian\0"
+  "Oriya\0"
+  "Osmanya\0"
+  "P\0"
+  "Pc\0"
+  "Pd\0"
+  "Pe\0"
+  "Pf\0"
+  "Phags_Pa\0"
+  "Phoenician\0"
+  "Pi\0"
+  "Po\0"
+  "Ps\0"
+  "Runic\0"
+  "S\0"
+  "Sc\0"
+  "Shavian\0"
+  "Sinhala\0"
+  "Sk\0"
+  "Sm\0"
+  "So\0"
+  "Syloti_Nagri\0"
+  "Syriac\0"
+  "Tagalog\0"
+  "Tagbanwa\0"
+  "Tai_Le\0"
+  "Tamil\0"
+  "Telugu\0"
+  "Thaana\0"
+  "Thai\0"
+  "Tibetan\0"
+  "Tifinagh\0"
+  "Ugaritic\0"
+  "Yi\0"
+  "Z\0"
+  "Zl\0"
+  "Zp\0"
+  "Zs\0";
 
 const ucp_type_table _pcre_utt[] = {
-  { "C",  128 + ucp_C },
-  { "Cc", ucp_Cc },
-  { "Cf", ucp_Cf },
-  { "Cn", ucp_Cn },
-  { "Co", ucp_Co },
-  { "Cs", ucp_Cs },
-  { "L",  128 + ucp_L },
-  { "Ll", ucp_Ll },
-  { "Lm", ucp_Lm },
-  { "Lo", ucp_Lo },
-  { "Lt", ucp_Lt },
-  { "Lu", ucp_Lu },
-  { "M",  128 + ucp_M },
-  { "Mc", ucp_Mc },
-  { "Me", ucp_Me },
-  { "Mn", ucp_Mn },
-  { "N",  128 + ucp_N },
-  { "Nd", ucp_Nd },
-  { "Nl", ucp_Nl },
-  { "No", ucp_No },
-  { "P",  128 + ucp_P },
-  { "Pc", ucp_Pc },
-  { "Pd", ucp_Pd },
-  { "Pe", ucp_Pe },
-  { "Pf", ucp_Pf },
-  { "Pi", ucp_Pi },
-  { "Po", ucp_Po },
-  { "Ps", ucp_Ps },
-  { "S",  128 + ucp_S },
-  { "Sc", ucp_Sc },
-  { "Sk", ucp_Sk },
-  { "Sm", ucp_Sm },
-  { "So", ucp_So },
-  { "Z",  128 + ucp_Z },
-  { "Zl", ucp_Zl },
-  { "Zp", ucp_Zp },
-  { "Zs", ucp_Zs }
+  { 0,   PT_ANY, 0 },
+  { 4,   PT_SC, ucp_Arabic },
+  { 11,  PT_SC, ucp_Armenian },
+  { 20,  PT_SC, ucp_Balinese },
+  { 29,  PT_SC, ucp_Bengali },
+  { 37,  PT_SC, ucp_Bopomofo },
+  { 46,  PT_SC, ucp_Braille },
+  { 54,  PT_SC, ucp_Buginese },
+  { 63,  PT_SC, ucp_Buhid },
+  { 69,  PT_GC, ucp_C },
+  { 71,  PT_SC, ucp_Canadian_Aboriginal },
+  { 91,  PT_PC, ucp_Cc },
+  { 94,  PT_PC, ucp_Cf },
+  { 97,  PT_SC, ucp_Cherokee },
+  { 106, PT_PC, ucp_Cn },
+  { 109, PT_PC, ucp_Co },
+  { 112, PT_SC, ucp_Common },
+  { 119, PT_SC, ucp_Coptic },
+  { 126, PT_PC, ucp_Cs },
+  { 129, PT_SC, ucp_Cuneiform },
+  { 139, PT_SC, ucp_Cypriot },
+  { 147, PT_SC, ucp_Cyrillic },
+  { 156, PT_SC, ucp_Deseret },
+  { 164, PT_SC, ucp_Devanagari },
+  { 175, PT_SC, ucp_Ethiopic },
+  { 184, PT_SC, ucp_Georgian },
+  { 193, PT_SC, ucp_Glagolitic },
+  { 204, PT_SC, ucp_Gothic },
+  { 211, PT_SC, ucp_Greek },
+  { 217, PT_SC, ucp_Gujarati },
+  { 226, PT_SC, ucp_Gurmukhi },
+  { 235, PT_SC, ucp_Han },
+  { 239, PT_SC, ucp_Hangul },
+  { 246, PT_SC, ucp_Hanunoo },
+  { 254, PT_SC, ucp_Hebrew },
+  { 261, PT_SC, ucp_Hiragana },
+  { 270, PT_SC, ucp_Inherited },
+  { 280, PT_SC, ucp_Kannada },
+  { 288, PT_SC, ucp_Katakana },
+  { 297, PT_SC, ucp_Kharoshthi },
+  { 308, PT_SC, ucp_Khmer },
+  { 314, PT_GC, ucp_L },
+  { 316, PT_LAMP, 0 },
+  { 319, PT_SC, ucp_Lao },
+  { 323, PT_SC, ucp_Latin },
+  { 329, PT_SC, ucp_Limbu },
+  { 335, PT_SC, ucp_Linear_B },
+  { 344, PT_PC, ucp_Ll },
+  { 347, PT_PC, ucp_Lm },
+  { 350, PT_PC, ucp_Lo },
+  { 353, PT_PC, ucp_Lt },
+  { 356, PT_PC, ucp_Lu },
+  { 359, PT_GC, ucp_M },
+  { 361, PT_SC, ucp_Malayalam },
+  { 371, PT_PC, ucp_Mc },
+  { 374, PT_PC, ucp_Me },
+  { 377, PT_PC, ucp_Mn },
+  { 380, PT_SC, ucp_Mongolian },
+  { 390, PT_SC, ucp_Myanmar },
+  { 398, PT_GC, ucp_N },
+  { 400, PT_PC, ucp_Nd },
+  { 403, PT_SC, ucp_New_Tai_Lue },
+  { 415, PT_SC, ucp_Nko },
+  { 419, PT_PC, ucp_Nl },
+  { 422, PT_PC, ucp_No },
+  { 425, PT_SC, ucp_Ogham },
+  { 431, PT_SC, ucp_Old_Italic },
+  { 442, PT_SC, ucp_Old_Persian },
+  { 454, PT_SC, ucp_Oriya },
+  { 460, PT_SC, ucp_Osmanya },
+  { 468, PT_GC, ucp_P },
+  { 470, PT_PC, ucp_Pc },
+  { 473, PT_PC, ucp_Pd },
+  { 476, PT_PC, ucp_Pe },
+  { 479, PT_PC, ucp_Pf },
+  { 482, PT_SC, ucp_Phags_Pa },
+  { 491, PT_SC, ucp_Phoenician },
+  { 502, PT_PC, ucp_Pi },
+  { 505, PT_PC, ucp_Po },
+  { 508, PT_PC, ucp_Ps },
+  { 511, PT_SC, ucp_Runic },
+  { 517, PT_GC, ucp_S },
+  { 519, PT_PC, ucp_Sc },
+  { 522, PT_SC, ucp_Shavian },
+  { 530, PT_SC, ucp_Sinhala },
+  { 538, PT_PC, ucp_Sk },
+  { 541, PT_PC, ucp_Sm },
+  { 544, PT_PC, ucp_So },
+  { 547, PT_SC, ucp_Syloti_Nagri },
+  { 560, PT_SC, ucp_Syriac },
+  { 567, PT_SC, ucp_Tagalog },
+  { 575, PT_SC, ucp_Tagbanwa },
+  { 584, PT_SC, ucp_Tai_Le },
+  { 591, PT_SC, ucp_Tamil },
+  { 597, PT_SC, ucp_Telugu },
+  { 604, PT_SC, ucp_Thaana },
+  { 611, PT_SC, ucp_Thai },
+  { 616, PT_SC, ucp_Tibetan },
+  { 624, PT_SC, ucp_Tifinagh },
+  { 633, PT_SC, ucp_Ugaritic },
+  { 642, PT_SC, ucp_Yi },
+  { 645, PT_GC, ucp_Z },
+  { 647, PT_PC, ucp_Zl },
+  { 650, PT_PC, ucp_Zp },
+  { 653, PT_PC, ucp_Zs }
 };
 
 const int _pcre_utt_size = sizeof(_pcre_utt)/sizeof(ucp_type_table);
 
+#endif  /* SUPPORT_UTF8 */
+
 /* End of pcre_tables.c */