Expansions: disallow UTF-16 surrogates from ${utf8clean:...}. Bug 2998
authorJasen Betts <jasen@xnet.co.nz>
Sun, 23 Jul 2023 12:43:59 +0000 (13:43 +0100)
committerJeremy Harris <jgh146exb@wizmail.org>
Sun, 23 Jul 2023 12:49:10 +0000 (13:49 +0100)
doc/doc-txt/ChangeLog
src/src/expand.c

index a3b43b2f564ee9145a9e1d5f22d9605292031f5d..3d74d58b0fdca9138f065255ccdffb67de1ee393 100644 (file)
@@ -163,6 +163,10 @@ JH/30 Bug 3006: Fix handling of JSON strings having embedded commas. Previously
       need to be protected by the doublequotes.  While there, add handling for
       backslashes.
 
+JH/31 Bug 2998: Fix ${utf8clean:...} to disallow UTF-16 surrogate codepoints.
+      Found and fixed by Jasen Betts. No testcase for this as my usual text
+      editor insists on emitting only valid UTF-8.
+
 Exim version 4.96
 -----------------
 
index fea6501fe881d140aebf5dc096394701ef5ca5ef..d8ea7ae6be4d0e4c7ea14fb25b2917225942fe6b 100644 (file)
@@ -7862,7 +7862,7 @@ NOT_ITEM: ;
        case EOP_UTF8CLEAN:
          {
          int seq_len = 0, index = 0, bytes_left = 0, complete;
-         long codepoint = -1;
+         ulong codepoint = (ulong)-1;
          uschar seq_buff[4];                   /* accumulate utf-8 here */
 
          /* Manually track tainting, as we deal in individual chars below */
@@ -7896,6 +7896,15 @@ NOT_ITEM: ;
                if (--bytes_left == 0)          /* codepoint complete */
                  if(codepoint > 0x10FFFF)      /* is it too large? */
                    complete = -1;      /* error (RFC3629 limit) */
+                 else if ( (codepoint & 0x1FF800 ) == 0xD800 ) /* surrogate */
+                   /* A UTF-16 surrogate (which should be one of a pair that
+                   encode a Unicode codepoint that is outside the Basic
+                   Multilingual Plane).  Error, not UTF8.
+                   RFC2279.2 is slightly unclear on this, but 
+                   https://unicodebook.readthedocs.io/issues.html#strict-utf8-decoder
+                   says "Surrogates characters are also invalid in UTF-8:
+                   characters in U+D800—U+DFFF have to be rejected." */
+                   complete = -1;
                  else
                    {           /* finished; output utf-8 sequence */
                    yield = string_catn(yield, seq_buff, seq_len);
@@ -7905,27 +7914,25 @@ NOT_ITEM: ;
              }
            else        /* no bytes left: new sequence */
              {
-             if(!(c & 0x80))   /* 1-byte sequence, US-ASCII, keep it */
+             if (!(c & 0x80))  /* 1-byte sequence, US-ASCII, keep it */
                {
                yield = string_catn(yield, &c, 1);
                continue;
                }
-             if((c & 0xe0) == 0xc0)            /* 2-byte sequence */
-               {
-               if(c == 0xc0 || c == 0xc1)      /* 0xc0 and 0xc1 are illegal */
+             if ((c & 0xe0) == 0xc0)           /* 2-byte sequence */
+               if (c == 0xc0 || c == 0xc1)     /* 0xc0 and 0xc1 are illegal */
                  complete = -1;
                else
                  {
-                   bytes_left = 1;
-                   codepoint = c & 0x1f;
+                 bytes_left = 1;
+                 codepoint = c & 0x1f;
                  }
-               }
-             else if((c & 0xf0) == 0xe0)               /* 3-byte sequence */
+             else if ((c & 0xf0) == 0xe0)              /* 3-byte sequence */
                {
                bytes_left = 2;
                codepoint = c & 0x0f;
                }
-             else if((c & 0xf8) == 0xf0)               /* 4-byte sequence */
+             else if ((c & 0xf8) == 0xf0)              /* 4-byte sequence */
                {
                bytes_left = 3;
                codepoint = c & 0x07;