From: Jasen Betts Date: Sun, 23 Jul 2023 12:43:59 +0000 (+0100) Subject: Expansions: disallow UTF-16 surrogates from ${utf8clean:...}. Bug 2998 X-Git-Tag: exim-4.97-RC0~50 X-Git-Url: https://git.exim.org/exim.git/commitdiff_plain/1209e3e19e292cee517e43a2ccfe9b44b33bb1dc?hp=66ce3fc9291d13fe8a7d4099942b9101aef1c38c Expansions: disallow UTF-16 surrogates from ${utf8clean:...}. Bug 2998 --- diff --git a/doc/doc-txt/ChangeLog b/doc/doc-txt/ChangeLog index a3b43b2f5..3d74d58b0 100644 --- a/doc/doc-txt/ChangeLog +++ b/doc/doc-txt/ChangeLog @@ -163,6 +163,10 @@ JH/30 Bug 3006: Fix handling of JSON strings having embedded commas. Previously need to be protected by the doublequotes. While there, add handling for backslashes. +JH/31 Bug 2998: Fix ${utf8clean:...} to disallow UTF-16 surrogate codepoints. + Found and fixed by Jasen Betts. No testcase for this as my usual text + editor insists on emitting only valid UTF-8. + Exim version 4.96 ----------------- diff --git a/src/src/expand.c b/src/src/expand.c index fea6501fe..d8ea7ae6b 100644 --- a/src/src/expand.c +++ b/src/src/expand.c @@ -7862,7 +7862,7 @@ NOT_ITEM: ; case EOP_UTF8CLEAN: { int seq_len = 0, index = 0, bytes_left = 0, complete; - long codepoint = -1; + ulong codepoint = (ulong)-1; uschar seq_buff[4]; /* accumulate utf-8 here */ /* Manually track tainting, as we deal in individual chars below */ @@ -7896,6 +7896,15 @@ NOT_ITEM: ; if (--bytes_left == 0) /* codepoint complete */ if(codepoint > 0x10FFFF) /* is it too large? */ complete = -1; /* error (RFC3629 limit) */ + else if ( (codepoint & 0x1FF800 ) == 0xD800 ) /* surrogate */ + /* A UTF-16 surrogate (which should be one of a pair that + encode a Unicode codepoint that is outside the Basic + Multilingual Plane). Error, not UTF8. + RFC2279.2 is slightly unclear on this, but + https://unicodebook.readthedocs.io/issues.html#strict-utf8-decoder + says "Surrogates characters are also invalid in UTF-8: + characters in U+D800—U+DFFF have to be rejected." */ + complete = -1; else { /* finished; output utf-8 sequence */ yield = string_catn(yield, seq_buff, seq_len); @@ -7905,27 +7914,25 @@ NOT_ITEM: ; } else /* no bytes left: new sequence */ { - if(!(c & 0x80)) /* 1-byte sequence, US-ASCII, keep it */ + if (!(c & 0x80)) /* 1-byte sequence, US-ASCII, keep it */ { yield = string_catn(yield, &c, 1); continue; } - if((c & 0xe0) == 0xc0) /* 2-byte sequence */ - { - if(c == 0xc0 || c == 0xc1) /* 0xc0 and 0xc1 are illegal */ + if ((c & 0xe0) == 0xc0) /* 2-byte sequence */ + if (c == 0xc0 || c == 0xc1) /* 0xc0 and 0xc1 are illegal */ complete = -1; else { - bytes_left = 1; - codepoint = c & 0x1f; + bytes_left = 1; + codepoint = c & 0x1f; } - } - else if((c & 0xf0) == 0xe0) /* 3-byte sequence */ + else if ((c & 0xf0) == 0xe0) /* 3-byte sequence */ { bytes_left = 2; codepoint = c & 0x0f; } - else if((c & 0xf8) == 0xf0) /* 4-byte sequence */ + else if ((c & 0xf8) == 0xf0) /* 4-byte sequence */ { bytes_left = 3; codepoint = c & 0x07;