From: Jasen Betts <jasen@xnet.co.nz>
Date: Sun, 23 Jul 2023 12:43:59 +0000 (+0100)
Subject: Expansions: disallow UTF-16 surrogates from ${utf8clean:...}.  Bug 2998
X-Git-Tag: exim-4.97-RC0~50
X-Git-Url: https://git.exim.org/exim.git/commitdiff_plain/1209e3e19e292cee517e43a2ccfe9b44b33bb1dc?hp=66ce3fc9291d13fe8a7d4099942b9101aef1c38c

Expansions: disallow UTF-16 surrogates from ${utf8clean:...}.  Bug 2998
---

diff --git a/doc/doc-txt/ChangeLog b/doc/doc-txt/ChangeLog
index a3b43b2f5..3d74d58b0 100644
--- a/doc/doc-txt/ChangeLog
+++ b/doc/doc-txt/ChangeLog
@@ -163,6 +163,10 @@ JH/30 Bug 3006: Fix handling of JSON strings having embedded commas. Previously
       need to be protected by the doublequotes.  While there, add handling for
       backslashes.
 
+JH/31 Bug 2998: Fix ${utf8clean:...} to disallow UTF-16 surrogate codepoints.
+      Found and fixed by Jasen Betts. No testcase for this as my usual text
+      editor insists on emitting only valid UTF-8.
+
 Exim version 4.96
 -----------------
 
diff --git a/src/src/expand.c b/src/src/expand.c
index fea6501fe..d8ea7ae6b 100644
--- a/src/src/expand.c
+++ b/src/src/expand.c
@@ -7862,7 +7862,7 @@ NOT_ITEM: ;
 	case EOP_UTF8CLEAN:
 	  {
 	  int seq_len = 0, index = 0, bytes_left = 0, complete;
-	  long codepoint = -1;
+	  ulong codepoint = (ulong)-1;
 	  uschar seq_buff[4];			/* accumulate utf-8 here */
 
 	  /* Manually track tainting, as we deal in individual chars below */
@@ -7896,6 +7896,15 @@ NOT_ITEM: ;
 		if (--bytes_left == 0)		/* codepoint complete */
 		  if(codepoint > 0x10FFFF)	/* is it too large? */
 		    complete = -1;	/* error (RFC3629 limit) */
+		  else if ( (codepoint & 0x1FF800 ) == 0xD800 ) /* surrogate */
+		    /* A UTF-16 surrogate (which should be one of a pair that
+		    encode a Unicode codepoint that is outside the Basic
+		    Multilingual Plane).  Error, not UTF8.
+		    RFC2279.2 is slightly unclear on this, but 
+		    https://unicodebook.readthedocs.io/issues.html#strict-utf8-decoder
+		    says "Surrogates characters are also invalid in UTF-8:
+		    characters in U+D800–U+DFFF have to be rejected." */
+		    complete = -1;
 		  else
 		    {		/* finished; output utf-8 sequence */
 		    yield = string_catn(yield, seq_buff, seq_len);
@@ -7905,27 +7914,25 @@ NOT_ITEM: ;
 	      }
 	    else	/* no bytes left: new sequence */
 	      {
-	      if(!(c & 0x80))	/* 1-byte sequence, US-ASCII, keep it */
+	      if (!(c & 0x80))	/* 1-byte sequence, US-ASCII, keep it */
 		{
 		yield = string_catn(yield, &c, 1);
 		continue;
 		}
-	      if((c & 0xe0) == 0xc0)		/* 2-byte sequence */
-		{
-		if(c == 0xc0 || c == 0xc1)	/* 0xc0 and 0xc1 are illegal */
+	      if ((c & 0xe0) == 0xc0)		/* 2-byte sequence */
+		if (c == 0xc0 || c == 0xc1)	/* 0xc0 and 0xc1 are illegal */
 		  complete = -1;
 		else
 		  {
-		    bytes_left = 1;
-		    codepoint = c & 0x1f;
+		  bytes_left = 1;
+		  codepoint = c & 0x1f;
 		  }
-		}
-	      else if((c & 0xf0) == 0xe0)		/* 3-byte sequence */
+	      else if ((c & 0xf0) == 0xe0)		/* 3-byte sequence */
 		{
 		bytes_left = 2;
 		codepoint = c & 0x0f;
 		}
-	      else if((c & 0xf8) == 0xf0)		/* 4-byte sequence */
+	      else if ((c & 0xf8) == 0xf0)		/* 4-byte sequence */
 		{
 		bytes_left = 3;
 		codepoint = c & 0x07;