+ /* replace illegal UTF-8 sequences by replacement character.
+    UTF8_REPLACEMENT_CHAR is the string substituted for each rejected
+    sequence; the code that uses it appends exactly one byte of it, so it
+    is the single ASCII character "?". (US is presumably the project's
+    cast to an unsigned-char string - confirm against its definition.) */
+
+ #define UTF8_REPLACEMENT_CHAR US"?"
+
+ case EOP_UTF8CLEAN:
+   {
+   /* Copy the NUL-terminated byte string at "sub" onto "yield", passing
+   well-formed UTF-8 through unchanged and substituting one
+   UTF8_REPLACEMENT_CHAR for every ill-formed sequence.  A sequence is
+   ill-formed when it:
+     - starts with a byte that cannot begin a 1..4 byte sequence
+       (a stray continuation byte, or 0xF8..0xFF);
+     - is interrupted by a byte that is not a continuation byte;
+     - is cut short by the end of the string;
+     - decodes to more than 0x10FFFF, to a UTF-16 surrogate
+       (0xD800..0xDFFF), or to a value that a shorter sequence could
+       have encoded (an "overlong" form).
+   The byte that interrupts a sequence is not consumed as part of the
+   error; it is examined again as the possible start of a new sequence,
+   so a valid character following a broken one is preserved. */
+
+   /* Smallest codepoint that genuinely needs an N-byte encoding, indexed
+   by N; anything smaller in an N-byte sequence is an overlong form. */
+   static const unsigned long min_codepoint[5] =
+     { 0, 0, 0x80, 0x800, 0x10000 };
+
+   int seq_len = 0, index = 0;
+   int bytes_left = 0;
+   unsigned long codepoint = 0;   /* must persist across loop iterations */
+   uschar seq_buff[4];            /* accumulate utf-8 here */
+
+   while (*sub != 0)
+     {
+     uschar c = *sub++;
+
+     if (bytes_left)              /* inside a multi-byte sequence */
+       {
+       if ((c & 0xc0) != 0x80)
+         {
+         /* Not a continuation byte: the pending sequence is broken.
+         Replace it, then step back so that this byte is processed
+         afresh as the start of a new sequence.  c is non-zero here
+         (loop condition), so stepping back cannot loop forever:
+         bytes_left is now 0 and the byte is consumed next time. */
+         yield = string_cat(yield, &size, &ptr, UTF8_REPLACEMENT_CHAR, 1);
+         bytes_left = index = 0;
+         sub--;
+         continue;
+         }
+
+       codepoint = (codepoint << 6) | (c & 0x3f);
+       seq_buff[index++] = c;
+       if (--bytes_left != 0)
+         continue;                /* more continuation bytes expected */
+
+       /* Codepoint complete: validate the decoded value. */
+       index = 0;
+       if (  codepoint > 0x10FFFF                      /* beyond Unicode */
+          || codepoint < min_codepoint[seq_len]        /* overlong form */
+          || (codepoint >= 0xD800 && codepoint <= 0xDFFF)  /* surrogate */
+          )
+         yield = string_cat(yield, &size, &ptr, UTF8_REPLACEMENT_CHAR, 1);
+       else                       /* finished; output utf-8 sequence */
+         yield = string_cat(yield, &size, &ptr, seq_buff, seq_len);
+       continue;
+       }
+
+     /* No bytes pending: c starts a new sequence. */
+
+     if ((c & 0x80) == 0)         /* 1-byte sequence, US-ASCII, keep it */
+       {
+       yield = string_cat(yield, &size, &ptr, &c, 1);
+       continue;
+       }
+
+     if ((c & 0xe0) == 0xc0)      /* 2-byte sequence */
+       {
+       bytes_left = 1;
+       codepoint = c & 0x1f;
+       }
+     else if ((c & 0xf0) == 0xe0) /* 3-byte sequence */
+       {
+       bytes_left = 2;
+       codepoint = c & 0x0f;
+       }
+     else if ((c & 0xf8) == 0xf0) /* 4-byte sequence */
+       {
+       bytes_left = 3;
+       codepoint = c & 0x07;
+       }
+     else  /* stray continuation byte, or too long (RFC3629 allows only 4 bytes) */
+       {
+       yield = string_cat(yield, &size, &ptr, UTF8_REPLACEMENT_CHAR, 1);
+       continue;
+       }
+
+     seq_buff[0] = c;
+     index = 1;
+     seq_len = bytes_left + 1;
+     }
+
+   /* The string ended in the middle of a sequence: report that too,
+   rather than silently dropping the partial character. */
+   if (bytes_left != 0)
+     yield = string_cat(yield, &size, &ptr, UTF8_REPLACEMENT_CHAR, 1);
+
+   continue;
+   }
+