${utf8clean:string} expansion operator. Bug 1401
author Axel Rau <axel.rau@chaos1.de>
Sat, 8 Mar 2014 20:59:24 +0000 (20:59 +0000)
committer Jeremy Harris <jgh146exb@wizmail.org>
Sat, 8 Mar 2014 21:01:55 +0000 (21:01 +0000)
doc/doc-docbook/spec.xfpt
doc/doc-txt/ChangeLog
doc/doc-txt/NewStuff
src/src/expand.c
test/confs/0600 [new file with mode: 0644]
test/log/0600 [new file with mode: 0644]
test/mail/0600.CALLER [new file with mode: 0644]
test/scripts/0000-Basic/0600 [new file with mode: 0644]
test/stdout/0600 [new file with mode: 0644]

index 4faf78d9040d9383ceae6bff65a334426fef8f38..e4cd6ed300cbffb6e111711ffc3b2bbdadbee978 100644 (file)
@@ -10171,6 +10171,14 @@ number of larger units and output in Exim's normal time format, for example,
 .cindex "expansion" "case forcing"
 .cindex "&%uc%& expansion item"
 This forces the letters in the string into upper-case.
 .cindex "expansion" "case forcing"
 .cindex "&%uc%& expansion item"
 This forces the letters in the string into upper-case.
+
+.vitem &*${utf8clean:*&<&'utf-8 string'&>&*}*&
+.cindex "correction of invalid utf-8 sequences in strings"
+.cindex "utf-8" "utf-8 sequences"
+.cindex "wrong utf-8"
+.cindex "expansion" "utf-8 forcing"
+.cindex "&%utf8clean%& expansion item"
+This replaces any invalid utf-8 sequence in the string by the character &`?`&.
 .endlist
 
 
 .endlist
 
 
index 1ca558e34a99b30ec227c7ae3700d10eda1ee969..83c255c24279ba9a800444591bb88696a27010d2 100644 (file)
@@ -46,6 +46,8 @@ TL/05 Rename SPF condition results err_perm and err_temp to standardized
       the ACL tests for either of these two results. Patch contributed by
       user bes-internal on the mailing list.
 
       the ACL tests for either of these two results. Patch contributed by
       user bes-internal on the mailing list.
 
+JH/04 Add ${utf8clean:} operator. Contributed by Axel Rau.
+
 Exim version 4.82
 -----------------
 
 Exim version 4.82
 -----------------
 
index dd3e5871422670a5414b404c739cfb21eef6d52d..04ac831dc04926be04983e4358f18a210244dd10 100644 (file)
@@ -19,6 +19,9 @@ Version 4.83
     those non-ASCII characters, but downstream apps may not, so Exim can
     detect and reject if those characters are present.
 
     those non-ASCII characters, but downstream apps may not, so Exim can
     detect and reject if those characters are present.
 
+ 3. New expansion operator ${utf8clean:string} to replace malformed UTF-8
+    sequences with the character '?'.
+
 
 Version 4.82
 ------------
 
 Version 4.82
 ------------
index 7e1b32343050182980b40062f3b155ed8e18b25a..ee7b2fdc68511ecb0b7ea6ed581f15cce2ccdfce 100644 (file)
@@ -204,7 +204,8 @@ static uschar *op_table_main[] = {
   US"str2b64",
   US"strlen",
   US"substr",
   US"str2b64",
   US"strlen",
   US"substr",
-  US"uc" };
+  US"uc",
+  US"utf8clean" };
 
 enum {
   EOP_ADDRESS =  sizeof(op_table_underscore)/sizeof(uschar *),
 
 enum {
   EOP_ADDRESS =  sizeof(op_table_underscore)/sizeof(uschar *),
@@ -240,7 +241,8 @@ enum {
   EOP_STR2B64,
   EOP_STRLEN,
   EOP_SUBSTR,
   EOP_STR2B64,
   EOP_STRLEN,
   EOP_SUBSTR,
-  EOP_UC };
+  EOP_UC,
+  EOP_UTF8CLEAN };
 
 
 /* Table of condition names, and corresponding switch numbers. The names must
 
 
 /* Table of condition names, and corresponding switch numbers. The names must
@@ -6206,6 +6208,89 @@ while (*s != 0)
         continue;
         }
 
         continue;
         }
 
+         /* replace illegal UTF-8 sequences by replacement character  */
+         
+      #define UTF8_REPLACEMENT_CHAR US"?"
+
+      case EOP_UTF8CLEAN:
+        {
+        int seq_len, index = 0;
+        int bytes_left  = 0;
+        uschar seq_buff[4];                    /* accumulate utf-8 here */
+        
+        while (*sub != 0)
+         {
+         int complete;
+         long codepoint;
+         uschar c;
+
+         complete = 0;
+         c = *sub++;
+         if(bytes_left)
+           {
+           if ((c & 0xc0) != 0x80)
+             {
+                   /* wrong continuation byte; invalidate all bytes */
+             complete = 1; /* error */
+             }
+           else
+             {
+             codepoint = (codepoint << 6) | (c & 0x3f);
+             seq_buff[index++] = c;
+             if (--bytes_left == 0)            /* codepoint complete */
+               {
+               if(codepoint > 0x10FFFF)        /* is it too large? */
+                 complete = -1;        /* error */
+               else
+                 {             /* finished; output utf-8 sequence */
+                 yield = string_cat(yield, &size, &ptr, seq_buff, seq_len);
+                 index = 0;
+                 }
+               }
+             }
+           }
+         else  /* no bytes left: new sequence */
+           {
+           if((c & 0x80) == 0) /* 1-byte sequence, US-ASCII, keep it */
+             {
+             yield = string_cat(yield, &size, &ptr, &c, 1);
+             continue;
+             }
+           if((c & 0xe0) == 0xc0)              /* 2-byte sequence */
+             {
+             bytes_left = 1;
+             codepoint = c & 0x1f;
+             }
+           else if((c & 0xf0) == 0xe0)         /* 3-byte sequence */
+             {
+             bytes_left = 2;
+             codepoint = c & 0x0f;
+             }
+           else if((c & 0xf8) == 0xf0)         /* 4-byte sequence */
+             {
+             bytes_left = 3;
+             codepoint = c & 0x07;
+             }
+           else        /* invalid or too long (RFC3629 allows only 4 bytes) */
+             complete = -1;
+
+           seq_buff[index++] = c;
+           seq_len = bytes_left + 1;
+           }           /* if(bytes_left) */
+
+         if (complete != 0)
+           {
+           bytes_left = index = 0;
+           yield = string_cat(yield, &size, &ptr, UTF8_REPLACEMENT_CHAR, 1);
+           }
+         if ((complete == 1) && ((c & 0x80) == 0))
+           { /* ASCII character follows incomplete sequence */
+             yield = string_cat(yield, &size, &ptr, &c, 1);
+           }
+         }
+        continue;
+        }
+
       /* escape turns all non-printing characters into escape sequences. */
 
       case EOP_ESCAPE:
       /* escape turns all non-printing characters into escape sequences. */
 
       case EOP_ESCAPE:
@@ -6834,4 +6919,7 @@ return 0;
 
 #endif
 
 
 #endif
 
+/*
+ vi: aw ai sw=2
+*/
 /* End of expand.c */
 /* End of expand.c */
diff --git a/test/confs/0600 b/test/confs/0600
new file mode 100644 (file)
index 0000000..0347e4c
--- /dev/null
@@ -0,0 +1,69 @@
+# Exim test configuration 0600
+
+exim_path = EXIM_PATH
+host_lookup_order = bydns
+rfc1413_query_timeout = 0s
+spool_directory = DIR/spool
+log_file_path = DIR/spool/log/%slog
+gecos_pattern = ""
+gecos_name = CALLER_NAME
+
+# ----- Main settings -----
+
+domainlist local_domains = @
+
+acl_smtp_rcpt = accept
+acl_smtp_data = check_data
+
+trusted_users = CALLER
+
+
+# ----- ACL -----
+
+begin acl
+
+check_data:
+  accept logwrite = \
+                       x-test-header-good1: ${utf8clean:$h_x-test-header-good1:}
+                logwrite = \
+                       x-test-header-good2: ${utf8clean:$h_x-test-header-good2:}
+                logwrite = \
+                       x-test-header-too-short: ${utf8clean:$h_x-test-header-too-short:}
+                logwrite = \
+                       x-test-header-too-long: ${utf8clean:$h_x-test-header-too-long:}
+                logwrite = \
+                       x-test-header-too-big: ${utf8clean:$h_x-test-header-too-big:}
+
+
+
+# ----- Routers -----
+
+begin routers
+
+fail_remote_domains:
+  driver = redirect
+  domains = ! +local_domains
+  data = :fail: unrouteable mail domain "$domain"
+
+localuser:
+  driver = accept
+  check_local_user
+  transport = local_delivery
+  headers_add = X-local-user: uid=$local_user_uid gid=$local_user_gid
+
+
+# ----- Transports -----
+
+begin transports
+
+local_delivery:
+  driver = appendfile
+  delivery_date_add
+  envelope_to_add
+  file = DIR/test-mail/$local_part
+  headers_add = "X-body-linecount: $body_linecount\n\
+                 X-message-linecount: $message_linecount\n\
+                 X-received-count: $received_count"
+  return_path_add
+
+# End
diff --git a/test/log/0600 b/test/log/0600
new file mode 100644 (file)
index 0000000..8fc8cfc
--- /dev/null
@@ -0,0 +1,18 @@
+1999-03-02 09:44:33 10HmaX-0005vi-00 x-test-header-good1: 1234567890qwertzuiopasdfghjklyxcvbnm,.-QWERTZUIOP+*ASDFGHJKL#'YXCVBNM,.-;:_
+1999-03-02 09:44:33 10HmaX-0005vi-00 x-test-header-good2: \303\237\303\274\303\266\303\244\342\202\254\303\234\303\226\303\204\302\264\340\244\221\340\244\225\340\244\234\341\220\201\341\221\214\341\221\225\360\253\235\206\360\253\237\230
+1999-03-02 09:44:33 10HmaX-0005vi-00 x-test-header-too-short: ?.?.?.\303\244-?.-\303\234.?..?.-?.-?..-?.-?.-?.-?.-?..-?..?.
+1999-03-02 09:44:33 10HmaX-0005vi-00 x-test-header-too-long: ?????-\303\244-?????--\303\226-\303\204-\302\264-\340\244\221-\340\244\225-\340\244\234-\341\220\201-\341\221\214-\341\221\225-?????\360\253\237\206
+1999-03-02 09:44:33 10HmaX-0005vi-00 x-test-header-too-big: ?-----\363\200\200\200
+1999-03-02 09:44:33 10HmaX-0005vi-00 <= CALLER@the.local.host.name U=CALLER P=local-smtp S=sss
+1999-03-02 09:44:33 10HmaX-0005vi-00 => CALLER <CALLER@the.local.host.name> R=localuser T=local_delivery
+1999-03-02 09:44:33 10HmaX-0005vi-00 Completed
+1999-03-02 09:44:33 10HmaY-0005vi-00 x-test-header-good1: 
+1999-03-02 09:44:33 10HmaY-0005vi-00 x-test-header-good2: 
+1999-03-02 09:44:33 10HmaY-0005vi-00 x-test-header-too-short: 
+1999-03-02 09:44:33 10HmaY-0005vi-00 x-test-header-too-long: 
+1999-03-02 09:44:33 10HmaY-0005vi-00 x-test-header-too-big: 
+1999-03-02 09:44:33 10HmaY-0005vi-00 <= CALLER@the.local.host.name U=CALLER P=local-smtp S=sss
+1999-03-02 09:44:33 10HmaY-0005vi-00 => CALLER <CALLER@the.local.host.name> R=localuser T=local_delivery
+1999-03-02 09:44:33 10HmaY-0005vi-00 Completed
+1999-03-02 09:44:33 Start queue run: pid=pppp
+1999-03-02 09:44:33 End queue run: pid=pppp
diff --git a/test/mail/0600.CALLER b/test/mail/0600.CALLER
new file mode 100644 (file)
index 0000000..e9a5005
--- /dev/null
@@ -0,0 +1,45 @@
+From CALLER@the.local.host.name Tue Mar 02 09:44:33 1999
+Return-path: <CALLER@the.local.host.name>
+Envelope-to: CALLER@the.local.host.name
+Delivery-date: Tue, 2 Mar 1999 09:44:33 +0000
+Received: from CALLER by the.local.host.name with local-smtp (Exim x.yz)
+       (envelope-from <CALLER@the.local.host.name>)
+       id 10HmaX-0005vi-00
+       for CALLER@the.local.host.name; Tue, 2 Mar 1999 09:44:33 +0000
+x-test-header-good1: 1234567890qwertzuiopasdfghjklyxcvbnm,.-QWERTZUIOP+*ASDFGHJKL#'YXCVBNM,.-;:_
+x-test-header-good2: ßüöä€ÜÖÄ´ऑकजᐁᑌᑕ𫝆𫟘
+x-test-header-too-short: Ã.Ã.Ã.ä-â\82.-Ã\9c.Ã..Ã.-Â.-à..-à¤.-à¤.-á\90.-á\91.-á..-ð«\9d..ð«\9f.
+x-test-header-too-long: ø\88\88\88\88-ä-ø\88\88\88\88--Ã\96\84-´-à¤\91-à¤\95-à¤\9c\90\81\91\8c\91\95\80\80\80\80ð«\9f\86
+x-test-header-too-big: -----󀀀
+Subject: This is a test message.
+Message-Id: <E10HmaX-0005vi-00@the.local.host.name>
+From: CALLER@the.local.host.name
+Date: Tue, 2 Mar 1999 09:44:33 +0000
+X-local-user: uid=CALLER_UID gid=CALLER_GID
+X-body-linecount: 3
+X-message-linecount: 16
+X-received-count: 1
+
+This is a test message.
+It has three lines.
+This is the last line.
+
+From CALLER@the.local.host.name Tue Mar 02 09:44:33 1999
+Return-path: <CALLER@the.local.host.name>
+Envelope-to: CALLER@the.local.host.name
+Delivery-date: Tue, 2 Mar 1999 09:44:33 +0000
+Received: from CALLER by the.local.host.name with local-smtp (Exim x.yz)
+       (envelope-from <CALLER@the.local.host.name>)
+       id 10HmaY-0005vi-00
+       for CALLER@the.local.host.name; Tue, 2 Mar 1999 09:44:33 +0000
+Subject: second
+Message-Id: <E10HmaY-0005vi-00@the.local.host.name>
+From: CALLER@the.local.host.name
+Date: Tue, 2 Mar 1999 09:44:33 +0000
+X-local-user: uid=CALLER_UID gid=CALLER_GID
+X-body-linecount: 1
+X-message-linecount: 9
+X-received-count: 1
+
+This is a second test message.
+
diff --git a/test/scripts/0000-Basic/0600 b/test/scripts/0000-Basic/0600
new file mode 100644 (file)
index 0000000..9d5e67b
--- /dev/null
@@ -0,0 +1,32 @@
+# ${utf8clean:string}
+#
+# -bs to simple local delivery
+exim -bs -odi
+mail from:CALLER@HOSTNAME
+rcpt to:CALLER@HOSTNAME
+data
+x-test-header-good1: 1234567890qwertzuiopasdfghjklyxcvbnm,.-QWERTZUIOP+*ASDFGHJKL#'YXCVBNM,.-;:_
+x-test-header-good2: ßüöä€ÜÖÄ´ऑकजᐁᑌᑕ𫝆𫟘
+x-test-header-too-short: Ã.Ã.Ã.ä-â\82.-Ã\9c.Ã..Ã.-Â.-à..-à¤.-à¤.-á\90.-á\91.-á..-ð«\9d..ð«\9f.
+x-test-header-too-long: ø\88\88\88\88-ä-ø\88\88\88\88--Ã\96\84-´-à¤\91-à¤\95-à¤\9c\90\81\91\8c\91\95\80\80\80\80ð«\9f\86
+x-test-header-too-big: -----󀀀
+Subject: This is a test message.
+
+This is a test message.
+It has three lines.
+This is the last line.
+.
+quit
+****
+exim -bs -odi
+mail from:CALLER@HOSTNAME
+rcpt to:CALLER@HOSTNAME
+data
+Subject: second
+
+This is a second test message.
+.
+quit
+****
+exim -q
+****
diff --git a/test/stdout/0600 b/test/stdout/0600
new file mode 100644 (file)
index 0000000..2b1941f
--- /dev/null
@@ -0,0 +1,12 @@
+220 the.local.host.name ESMTP Exim x.yz Tue, 2 Mar 1999 09:44:33 +0000\r
+250 OK\r
+250 Accepted\r
+354 Enter message, ending with "." on a line by itself\r
+250 OK id=10HmaX-0005vi-00\r
+221 the.local.host.name closing connection\r
+220 the.local.host.name ESMTP Exim x.yz Tue, 2 Mar 1999 09:44:33 +0000\r
+250 OK\r
+250 Accepted\r
+354 Enter message, ending with "." on a line by itself\r
+250 OK id=10HmaY-0005vi-00\r
+221 the.local.host.name closing connection\r