From b9c2e32fc0c80d2ff4afb7daa782d22c5c38a229 Mon Sep 17 00:00:00 2001 From: Axel Rau Date: Sat, 8 Mar 2014 20:59:24 +0000 Subject: [PATCH] ${utf8clean:string} expansion operator. Bug 1401 --- doc/doc-docbook/spec.xfpt | 8 ++++ doc/doc-txt/ChangeLog | 2 + doc/doc-txt/NewStuff | 3 ++ src/src/expand.c | 92 +++++++++++++++++++++++++++++++++++- test/confs/0600 | 69 +++++++++++++++++++++++++++ test/log/0600 | 18 +++++++ test/mail/0600.CALLER | 45 ++++++++++++++++++ test/scripts/0000-Basic/0600 | 32 +++++++++++++ test/stdout/0600 | 12 +++++ 9 files changed, 279 insertions(+), 2 deletions(-) create mode 100644 test/confs/0600 create mode 100644 test/log/0600 create mode 100644 test/mail/0600.CALLER create mode 100644 test/scripts/0000-Basic/0600 create mode 100644 test/stdout/0600 diff --git a/doc/doc-docbook/spec.xfpt b/doc/doc-docbook/spec.xfpt index 4faf78d90..e4cd6ed30 100644 --- a/doc/doc-docbook/spec.xfpt +++ b/doc/doc-docbook/spec.xfpt @@ -10171,6 +10171,14 @@ number of larger units and output in Exim's normal time format, for example, .cindex "expansion" "case forcing" .cindex "&%uc%& expansion item" This forces the letters in the string into upper-case. + +.vitem &*${utf8clean:*&<&'utf-8 string'&>&*}*& +.cindex "correction of invalid utf-8 sequences in strings" +.cindex "utf-8" "utf-8 sequences" +.cindex "wrong utf-8" +.cindex "expansion" "utf-8 forcing" +.cindex "&%utf8clean%& expansion item" +This replaces any invalid utf-8 sequence in the string by the character &`?`&. .endlist diff --git a/doc/doc-txt/ChangeLog b/doc/doc-txt/ChangeLog index 1ca558e34..83c255c24 100644 --- a/doc/doc-txt/ChangeLog +++ b/doc/doc-txt/ChangeLog @@ -46,6 +46,8 @@ TL/05 Rename SPF condition results err_perm and err_temp to standardized the ACL tests for either of these two results. Patch contributed by user bes-internal on the mailing list. +JH/04 Add ${utf8clean:} operator. Contributed by Axel Rau. 
+ Exim version 4.82 ----------------- diff --git a/doc/doc-txt/NewStuff b/doc/doc-txt/NewStuff index dd3e58714..04ac831dc 100644 --- a/doc/doc-txt/NewStuff +++ b/doc/doc-txt/NewStuff @@ -19,6 +19,9 @@ Version 4.83 those non-ASCII characters, but downstream apps may not, so Exim can detect and reject if those characters are present. + 3. New expansion operator ${utf8clean:string} to replace malformed UTF8 + codepoints with valid ones. + Version 4.82 ------------ diff --git a/src/src/expand.c b/src/src/expand.c index 7e1b32343..ee7b2fdc6 100644 --- a/src/src/expand.c +++ b/src/src/expand.c @@ -204,7 +204,8 @@ static uschar *op_table_main[] = { US"str2b64", US"strlen", US"substr", - US"uc" }; + US"uc", + US"utf8clean" }; enum { EOP_ADDRESS = sizeof(op_table_underscore)/sizeof(uschar *), @@ -240,7 +241,8 @@ enum { EOP_STR2B64, EOP_STRLEN, EOP_SUBSTR, - EOP_UC }; + EOP_UC, + EOP_UTF8CLEAN }; /* Table of condition names, and corresponding switch numbers. The names must @@ -6206,6 +6208,89 @@ while (*s != 0) continue; } + /* replace illegal UTF-8 sequences by replacement character */ + + #define UTF8_REPLACEMENT_CHAR US"?" + + case EOP_UTF8CLEAN: + { + int seq_len, index = 0; + int bytes_left = 0; + uschar seq_buff[4]; /* accumulate utf-8 here */ + + while (*sub != 0) + { + int complete; + long codepoint; + uschar c; + + complete = 0; + c = *sub++; + if(bytes_left) + { + if ((c & 0xc0) != 0x80) + { + /* wrong continuation byte; invalidate all bytes */ + complete = 1; /* error */ + } + else + { + codepoint = (codepoint << 6) | (c & 0x3f); + seq_buff[index++] = c; + if (--bytes_left == 0) /* codepoint complete */ + { + if(codepoint > 0x10FFFF) /* is it too large? 
*/ + complete = -1; /* error */ + else + { /* finished; output utf-8 sequence */ + yield = string_cat(yield, &size, &ptr, seq_buff, seq_len); + index = 0; + } + } + } + } + else /* no bytes left: new sequence */ + { + if((c & 0x80) == 0) /* 1-byte sequence, US-ASCII, keep it */ + { + yield = string_cat(yield, &size, &ptr, &c, 1); + continue; + } + if((c & 0xe0) == 0xc0) /* 2-byte sequence */ + { + bytes_left = 1; + codepoint = c & 0x1f; + } + else if((c & 0xf0) == 0xe0) /* 3-byte sequence */ + { + bytes_left = 2; + codepoint = c & 0x0f; + } + else if((c & 0xf8) == 0xf0) /* 4-byte sequence */ + { + bytes_left = 3; + codepoint = c & 0x07; + } + else /* invalid or too long (RFC3629 allows only 4 bytes) */ + complete = -1; + + seq_buff[index++] = c; + seq_len = bytes_left + 1; + } /* if(bytes_left) */ + + if (complete != 0) + { + bytes_left = index = 0; + yield = string_cat(yield, &size, &ptr, UTF8_REPLACEMENT_CHAR, 1); + } + if ((complete == 1) && ((c & 0x80) == 0)) + { /* ASCII character follows incomplete sequence */ + yield = string_cat(yield, &size, &ptr, &c, 1); + } + } + continue; + } + /* escape turns all non-printing characters into escape sequences. 
*/ case EOP_ESCAPE: @@ -6834,4 +6919,7 @@ return 0; #endif +/* + vi: aw ai sw=2 +*/ /* End of expand.c */ diff --git a/test/confs/0600 b/test/confs/0600 new file mode 100644 index 000000000..0347e4c60 --- /dev/null +++ b/test/confs/0600 @@ -0,0 +1,69 @@ +# Exim test configuration 0600 + +exim_path = EXIM_PATH +host_lookup_order = bydns +rfc1413_query_timeout = 0s +spool_directory = DIR/spool +log_file_path = DIR/spool/log/%slog +gecos_pattern = "" +gecos_name = CALLER_NAME + +# ----- Main settings ----- + +domainlist local_domains = @ + +acl_smtp_rcpt = accept +acl_smtp_data = check_data + +trusted_users = CALLER + + +# ----- ACL ----- + +begin acl + +check_data: + accept logwrite = \ + x-test-header-good1: ${utf8clean:$h_x-test-header-good1:} + logwrite = \ + x-test-header-good2: ${utf8clean:$h_x-test-header-good2:} + logwrite = \ + x-test-header-too-short: ${utf8clean:$h_x-test-header-too-short:} + logwrite = \ + x-test-header-too-long: ${utf8clean:$h_x-test-header-too-long:} + logwrite = \ + x-test-header-too-big: ${utf8clean:$h_x-test-header-too-big:} + + + +# ----- Routers ----- + +begin routers + +fail_remote_domains: + driver = redirect + domains = ! 
+local_domains + data = :fail: unrouteable mail domain "$domain" + +localuser: + driver = accept + check_local_user + transport = local_delivery + headers_add = X-local-user: uid=$local_user_uid gid=$local_user_gid + + +# ----- Transports ----- + +begin transports + +local_delivery: + driver = appendfile + delivery_date_add + envelope_to_add + file = DIR/test-mail/$local_part + headers_add = "X-body-linecount: $body_linecount\n\ + X-message-linecount: $message_linecount\n\ + X-received-count: $received_count" + return_path_add + +# End diff --git a/test/log/0600 b/test/log/0600 new file mode 100644 index 000000000..8fc8cfc36 --- /dev/null +++ b/test/log/0600 @@ -0,0 +1,18 @@ +1999-03-02 09:44:33 10HmaX-0005vi-00 x-test-header-good1: 1234567890qwertzuiopasdfghjklyxcvbnm,.-QWERTZUIOP+*ASDFGHJKL#'YXCVBNM,.-;:_ +1999-03-02 09:44:33 10HmaX-0005vi-00 x-test-header-good2: \303\237\303\274\303\266\303\244\342\202\254\303\234\303\226\303\204\302\264\340\244\221\340\244\225\340\244\234\341\220\201\341\221\214\341\221\225\360\253\235\206\360\253\237\230 +1999-03-02 09:44:33 10HmaX-0005vi-00 x-test-header-too-short: ?.?.?.\303\244-?.-\303\234.?..?.-?.-?..-?.-?.-?.-?.-?..-?..?. 
+1999-03-02 09:44:33 10HmaX-0005vi-00 x-test-header-too-long: ?????-\303\244-?????--\303\226-\303\204-\302\264-\340\244\221-\340\244\225-\340\244\234-\341\220\201-\341\221\214-\341\221\225-?????\360\253\237\206 +1999-03-02 09:44:33 10HmaX-0005vi-00 x-test-header-too-big: ?-----\363\200\200\200 +1999-03-02 09:44:33 10HmaX-0005vi-00 <= CALLER@the.local.host.name U=CALLER P=local-smtp S=sss +1999-03-02 09:44:33 10HmaX-0005vi-00 => CALLER R=localuser T=local_delivery +1999-03-02 09:44:33 10HmaX-0005vi-00 Completed +1999-03-02 09:44:33 10HmaY-0005vi-00 x-test-header-good1: +1999-03-02 09:44:33 10HmaY-0005vi-00 x-test-header-good2: +1999-03-02 09:44:33 10HmaY-0005vi-00 x-test-header-too-short: +1999-03-02 09:44:33 10HmaY-0005vi-00 x-test-header-too-long: +1999-03-02 09:44:33 10HmaY-0005vi-00 x-test-header-too-big: +1999-03-02 09:44:33 10HmaY-0005vi-00 <= CALLER@the.local.host.name U=CALLER P=local-smtp S=sss +1999-03-02 09:44:33 10HmaY-0005vi-00 => CALLER R=localuser T=local_delivery +1999-03-02 09:44:33 10HmaY-0005vi-00 Completed +1999-03-02 09:44:33 Start queue run: pid=pppp +1999-03-02 09:44:33 End queue run: pid=pppp diff --git a/test/mail/0600.CALLER b/test/mail/0600.CALLER new file mode 100644 index 000000000..e9a50054e --- /dev/null +++ b/test/mail/0600.CALLER @@ -0,0 +1,45 @@ +From CALLER@the.local.host.name Tue Mar 02 09:44:33 1999 +Return-path: +Envelope-to: CALLER@the.local.host.name +Delivery-date: Tue, 2 Mar 1999 09:44:33 +0000 +Received: from CALLER by the.local.host.name with local-smtp (Exim x.yz) + (envelope-from ) + id 10HmaX-0005vi-00 + for CALLER@the.local.host.name; Tue, 2 Mar 1999 09:44:33 +0000 +x-test-header-good1: 1234567890qwertzuiopasdfghjklyxcvbnm,.-QWERTZUIOP+*ASDFGHJKL#'YXCVBNM,.-;:_ +x-test-header-good2: ßüöä€ÜÖÄ´ऑकजᐁᑌᑕ𫝆𫟘 +x-test-header-too-short: Ã.Ã.Ã.ä-â‚.-Ü.Ã..Ã.-Â.-à..-à¤.-à¤.-á.-á‘.-á..-ð«..ð«Ÿ. +x-test-header-too-long: øˆˆˆˆ-ä-øˆˆˆˆ--Ö-Ä-´-ऑ-क-ज-ᐁ-ᑌ-ᑕ-ø€€€€ð«Ÿ† +x-test-header-too-big: ÷€€€-----󀀀 +Subject: This is a test message. 
+Message-Id: +From: CALLER@the.local.host.name +Date: Tue, 2 Mar 1999 09:44:33 +0000 +X-local-user: uid=CALLER_UID gid=CALLER_GID +X-body-linecount: 3 +X-message-linecount: 16 +X-received-count: 1 + +This is a test message. +It has three lines. +This is the last line. + +From CALLER@the.local.host.name Tue Mar 02 09:44:33 1999 +Return-path: +Envelope-to: CALLER@the.local.host.name +Delivery-date: Tue, 2 Mar 1999 09:44:33 +0000 +Received: from CALLER by the.local.host.name with local-smtp (Exim x.yz) + (envelope-from ) + id 10HmaY-0005vi-00 + for CALLER@the.local.host.name; Tue, 2 Mar 1999 09:44:33 +0000 +Subject: second +Message-Id: +From: CALLER@the.local.host.name +Date: Tue, 2 Mar 1999 09:44:33 +0000 +X-local-user: uid=CALLER_UID gid=CALLER_GID +X-body-linecount: 1 +X-message-linecount: 9 +X-received-count: 1 + +This is a second test message. + diff --git a/test/scripts/0000-Basic/0600 b/test/scripts/0000-Basic/0600 new file mode 100644 index 000000000..9d5e67b5b --- /dev/null +++ b/test/scripts/0000-Basic/0600 @@ -0,0 +1,32 @@ +# ${utf8clean:string} +# +# -bs to simple local delivery +exim -bs -odi +mail from:CALLER@HOSTNAME +rcpt to:CALLER@HOSTNAME +data +x-test-header-good1: 1234567890qwertzuiopasdfghjklyxcvbnm,.-QWERTZUIOP+*ASDFGHJKL#'YXCVBNM,.-;:_ +x-test-header-good2: ßüöä€ÜÖÄ´ऑकजᐁᑌᑕ𫝆𫟘 +x-test-header-too-short: Ã.Ã.Ã.ä-â‚.-Ü.Ã..Ã.-Â.-à..-à¤.-à¤.-á.-á‘.-á..-ð«..ð«Ÿ. +x-test-header-too-long: øˆˆˆˆ-ä-øˆˆˆˆ--Ö-Ä-´-ऑ-क-ज-ᐁ-ᑌ-ᑕ-ø€€€€ð«Ÿ† +x-test-header-too-big: ÷€€€-----󀀀 +Subject: This is a test message. + +This is a test message. +It has three lines. +This is the last line. +. +quit +**** +exim -bs -odi +mail from:CALLER@HOSTNAME +rcpt to:CALLER@HOSTNAME +data +Subject: second + +This is a second test message. +. 
+quit +**** +exim -q +**** diff --git a/test/stdout/0600 b/test/stdout/0600 new file mode 100644 index 000000000..2b1941f58 --- /dev/null +++ b/test/stdout/0600 @@ -0,0 +1,12 @@ +220 the.local.host.name ESMTP Exim x.yz Tue, 2 Mar 1999 09:44:33 +0000 +250 OK +250 Accepted +354 Enter message, ending with "." on a line by itself +250 OK id=10HmaX-0005vi-00 +221 the.local.host.name closing connection +220 the.local.host.name ESMTP Exim x.yz Tue, 2 Mar 1999 09:44:33 +0000 +250 OK +250 Accepted +354 Enter message, ending with "." on a line by itself +250 OK id=10HmaY-0005vi-00 +221 the.local.host.name closing connection -- 2.30.2