Expansions: disallow UTF-16 surrogates from ${utf8clean:...}. Bug 2998

author Jasen Betts <jasen@xnet.co.nz>

Sun, 23 Jul 2023 12:43:59 +0000 (13:43 +0100)

committer Jeremy Harris <jgh146exb@wizmail.org>

Sun, 23 Jul 2023 12:49:10 +0000 (13:49 +0100)
author Jasen Betts <jasen@xnet.co.nz>
Sun, 23 Jul 2023 12:43:59 +0000 (13:43 +0100)
committer Jeremy Harris <jgh146exb@wizmail.org>
Sun, 23 Jul 2023 12:49:10 +0000 (13:49 +0100)
diff --git a/doc/doc-txt/ChangeLog b/doc/doc-txt/ChangeLog

index a3b43b2f564ee9145a9e1d5f22d9605292031f5d..3d74d58b0fdca9138f065255ccdffb67de1ee393 100644 (file)
--- a/doc/doc-txt/ChangeLog
+++ b/doc/doc-txt/ChangeLog
@@ -163,6 +163,10 @@ JH/30 Bug 3006: Fix handling of JSON strings having embedded commas. Previously
        need to be protected by the doublequotes.  While there, add handling for
        backslashes.
  
+JH/31 Bug 2998: Fix ${utf8clean:...} to disallow UTF-16 surrogate codepoints.
+      Found and fixed by Jasen Betts. No testcase for this as my usual text
+      editor insists on emitting only valid UTF-8.
+
  Exim version 4.96
  -----------------
  
diff --git a/src/src/expand.c b/src/src/expand.c

index fea6501fe881d140aebf5dc096394701ef5ca5ef..d8ea7ae6be4d0e4c7ea14fb25b2917225942fe6b 100644 (file)
--- a/src/src/expand.c
+++ b/src/src/expand.c
@@ -7862,7 +7862,7 @@ NOT_ITEM: ;
         case EOP_UTF8CLEAN:
           {
           int seq_len = 0, index = 0, bytes_left = 0, complete;
-         long codepoint = -1;
+         ulong codepoint = (ulong)-1;
           uschar seq_buff[4];                   /* accumulate utf-8 here */
  
           /* Manually track tainting, as we deal in individual chars below */
@@ -7896,6 +7896,15 @@ NOT_ITEM: ;
                 if (--bytes_left == 0)          /* codepoint complete */
                   if(codepoint > 0x10FFFF)      /* is it too large? */
                     complete = -1;      /* error (RFC3629 limit) */
+                 else if ( (codepoint & 0x1FF800 ) == 0xD800 ) /* surrogate */
+                   /* A UTF-16 surrogate (which should be one of a pair that
+                   encode a Unicode codepoint that is outside the Basic
+                   Multilingual Plane).  Error, not UTF8.
+                   RFC2279.2 is slightly unclear on this, but 
+                   https://unicodebook.readthedocs.io/issues.html#strict-utf8-decoder
+                   says "Surrogates characters are also invalid in UTF-8:
+                   characters in U+D800—U+DFFF have to be rejected." */
+                   complete = -1;
                   else
                     {           /* finished; output utf-8 sequence */
                     yield = string_catn(yield, seq_buff, seq_len);
@@ -7905,27 +7914,25 @@ NOT_ITEM: ;
               }
             else        /* no bytes left: new sequence */
               {
-             if(!(c & 0x80))   /* 1-byte sequence, US-ASCII, keep it */
+             if (!(c & 0x80))  /* 1-byte sequence, US-ASCII, keep it */
                 {
                 yield = string_catn(yield, &c, 1);
                 continue;
                 }
-             if((c & 0xe0) == 0xc0)            /* 2-byte sequence */
-               {
-               if(c == 0xc0 || c == 0xc1)      /* 0xc0 and 0xc1 are illegal */
+             if ((c & 0xe0) == 0xc0)           /* 2-byte sequence */
+               if (c == 0xc0 || c == 0xc1)     /* 0xc0 and 0xc1 are illegal */
                   complete = -1;
                 else
                   {
-                   bytes_left = 1;
-                   codepoint = c & 0x1f;
+                 bytes_left = 1;
+                 codepoint = c & 0x1f;
                   }
-               }
-             else if((c & 0xf0) == 0xe0)               /* 3-byte sequence */
+             else if ((c & 0xf0) == 0xe0)              /* 3-byte sequence */
                 {
                 bytes_left = 2;
                 codepoint = c & 0x0f;
                 }
-             else if((c & 0xf8) == 0xf0)               /* 4-byte sequence */
+             else if ((c & 0xf8) == 0xf0)              /* 4-byte sequence */
                 {
                 bytes_left = 3;
                 codepoint = c & 0x07;
author	Jasen Betts <jasen@xnet.co.nz>
	Sun, 23 Jul 2023 12:43:59 +0000 (13:43 +0100)
committer	Jeremy Harris <jgh146exb@wizmail.org>
	Sun, 23 Jul 2023 12:49:10 +0000 (13:49 +0100)
doc/doc-txt/ChangeLog		patch | blob | history
src/src/expand.c		patch | blob | history