1 /*************************************************
2 * Exim - an Internet mail transport agent *
3 *************************************************/
5 /* Copyright (c) The Exim Maintainers 2020 - 2022 */
6 /* Copyright (c) University of Cambridge 1995 - 2018 */
7 /* See the file NOTICE for conditions of use and distribution. */
8 /* SPDX-License-Identifier: GPL-2.0-or-later */
10 /* This file contains a function for decoding message header lines that may
11 contain encoded "words" according to the rules described in
13 RFC-2047 at http://www.ietf.org/rfc/rfc2047.txt
15 The function is a rewritten version of code created by Norihisa Washitake.
16 The original could be used both inside Exim (as part of a patch) or in a
17 freestanding form. The original contained some built-in code conversions; I
18 have chosen only to do code conversions if iconv() is supported by the OS.
19 Because there were quite a lot of hacks to be done, for a variety of reasons,
20 I rewrote it.
22 You can find the latest version of the original library at
24 http://washitake.com/mail/exim/mime/
26 The code below is almost completely unlike the original. */
32 /*************************************************
33 * Do a QP conversion *
34 *************************************************/
36 /* This function decodes "quoted printable" into bytes.
38 Arguments:
39 string the string that includes QP escapes
40 ptrptr where to return pointer to the decoded string
42 Returns: the length of the decoded string, or -1 on failure
43 */
46 rfc2047_qpdecode(uschar *string, uschar **ptrptr)
/* The output block comes from store_get() with room for the input length plus
one byte. ptr is the write cursor and *ptrptr keeps the start of the block for
the caller. NOTE(review): the input string is also passed as the second
store_get() argument; its meaning is defined by store_get() - confirm there. */
51 ptr = *ptrptr = store_get(Ustrlen(string) + 1, string); /* No longer than this */
/* In the Q encoding of RFC 2047 an underscore stands for a space. */
57 if (ch == '_') *ptr++ = ' ';
/* b is the second character of an escape. When a is zero, string[1] is not
read and b is set to zero, so the hex-digit test below fails. */
61 int b = (a == 0)? 0 : string[1];
/* Both characters of the escape must be hexadecimal digits. */
62 if (!isxdigit(a) || !isxdigit(b)) return -1; /* Bad QP string */
/* The value of each digit is the index of its lower-case form in hex_digits;
the first digit supplies the high four bits and the second the low four. */
63 *ptr++ = ((Ustrchr(hex_digits, tolower(a)) - hex_digits) << 4) +
64 Ustrchr(hex_digits, tolower(b)) - hex_digits;
/* A literal space or tab inside the encoded text makes the whole decode fail. */
67 else if (ch == ' ' || ch == '\t') return -1; /* Whitespace is illegal */
79 /*************************************************
80 * Decode next MIME word *
81 *************************************************/
83 /* Scan a string to see if a MIME word exists; pass back the separator
84 points and the decoded data if it does.
86 Arguments:
87 string the string to be scanned
88 lencheck TRUE to enforce maximum length check
89 q1ptr pass back address of first question mark
90 q2ptr pass back address of second question mark
91 endptr pass back address of final ?=
92 dlenptr pass back length of decoded string
93 dptrptr pass back pointer to decoded string
95 Returns: address of =? or NULL if not present
96 */
99 decode_mimeword(uschar *string, BOOL lencheck, uschar **q1ptr, uschar **q2ptr,
100 uschar **endptr, size_t *dlenptr, uschar **dptrptr)
/* Search loop: a rejected candidate restarts the scan two characters past its
opening =? (see the loop increment). */
103 for (;; string = mimeword + 2)
/* Locate, in order, the opening =?, the question mark that ends the charset
name, the question mark that ends the encoding letter, and the closing ?=.
If any of them is absent, no MIME word is present and NULL is returned. */
108 if ((mimeword = Ustrstr(string, "=?")) == NULL ||
109 (*q1ptr = Ustrchr(mimeword+2, '?')) == NULL ||
110 (*q2ptr = Ustrchr(*q1ptr+1, '?')) == NULL ||
111 (*endptr = Ustrstr(*q2ptr+1, "?=")) == NULL) return NULL;
113 /* We have found =?xxx?xxx?xxx?= in the string. Optionally check the
114 length, and that the second field is just one character long. If not,
115 continue the loop to search again. We must start just after the initial =?
116 because we might have found =?xxx=?xxx?xxx?xxx?=. */
/* The distance tested against 73 stops at the closing ?= and so excludes its
two characters; the test therefore rejects words longer than 75 characters in
total, which is the maximum RFC 2047 allows for an encoded word. The second
test requires exactly one character between the two question marks. */
118 if ((lencheck && *endptr - mimeword > 73) || *q2ptr - *q1ptr != 2) continue;
120 /* Get the encoding letter, and decode the data string. */
/* The letter is upper-cased, so the encoding is matched case-insensitively. */
122 encoding = toupper((*q1ptr)[1]);
/* Both decoders are given the data that starts just after the second question
mark and hand back the decoded bytes through dptrptr; the value they return is
kept in dlen. */
125 dlen = b64decode(*q2ptr+1, dptrptr);
126 else if (encoding == 'Q')
127 dlen = rfc2047_qpdecode(*q2ptr+1, dptrptr);
/* NOTE(review): this writes back the first character of the closing ?=.
Presumably that character was replaced by a zero before decoding so that the
data was zero-terminated for the decoder - confirm against the full source. */
128 **endptr = '?'; /* restore */
130 /* If the decoding succeeded, we are done. Set the length of the decoded
131 string, and pass back the initial pointer. Otherwise, the loop continues. */
135 *dlenptr = (size_t)dlen;
140 /* Control should never actually get here */
145 /*************************************************
146 * Decode and convert an RFC 2047 string *
147 *************************************************/
149 /* There are two functions defined here. The original one was rfc2047_decode()
150 and it was documented in the local_scan() interface. I needed to add an extra
151 argument for use by expand_string(), so I created rfc2047_decode2() for that
152 purpose. The original function became a stub that just supplies NULL for the
153 new argument (sizeptr).
155 An RFC 2047-encoded string may contain one or more "words", each of the
156 form =?...?.?...?= with the first ... specifying the character code, the
157 second being Q (for quoted printable) or B for Base64 encoding. The third ...
158 is the actual encoded data.
160 This function first decodes each "word" into bytes from the Q or B encoding.
161 Then, if provided with the name of a charset encoding, and if iconv() is
162 available, it attempts to translate the result to the named character set.
163 If this fails, the binary string is returned with an error message.
165 If a binary zero is encountered in the decoded string, it is replaced by the
166 contents of the zeroval argument. For use with Exim headers, the value must not
167 be 0 because they are handled as zero-terminated strings. When zeroval==0,
168 lenptr should not be NULL.
170 Arguments:
171 string the subject string
172 lencheck TRUE to enforce maximum MIME word length
173 target the name of the target encoding for MIME words, or NULL for
174 no charset translation
175 zeroval the value to use for binary zero bytes
176 lenptr if not NULL, the length of the result is returned via
177 this variable
178 sizeptr if not NULL, the length of a new store block in which the
179 result is built is placed here; if no new store is obtained,
180 the value is not changed
181 error for error messages; NULL if no problem; this can be set
182 when the yield is non-NULL if there was a charset
183 translation problem
185 Returns: the decoded, converted string, or NULL on error; if there are
186 no MIME words in the string, the original string is returned
187 */
190 rfc2047_decode2(uschar *string, BOOL lencheck, const uschar *target,
191 int zeroval, int *lenptr, int *sizeptr, uschar **error)
/* size starts as the length of the input. That value is what gets reported
through lenptr below; later it is incremented by one and passed to
string_get_tainted() when the result string is created. */
193 int size = Ustrlen(string);
197 uschar * mimeword, * q1, * q2, * endword;
/* Look for the first MIME word. q1, q2 and endword receive the positions of
its separators, and dlen and dptr receive the decoded data. */
200 mimeword = decode_mimeword(string, lencheck, &q1, &q2, &endword, &dlen, &dptr);
/* Report the length of the unmodified input string. */
204 if (lenptr) *lenptr = size;
208 /* Scan through the string, decoding MIME words and copying intermediate text,
209 building the result as we go. The result may be longer than the input if it is
210 translated into a multibyte code such as UTF-8. That's why we use the dynamic
211 string building code. */
/* Create the growable result string, passing the input length plus one and
the input string to string_get_tainted(). */
213 yield = string_get_tainted(++size, string);
/* (iconv_t)(-1) means that no conversion descriptor is open; the same value is
tested before iconv() is called and before iconv_close() is called. */
219 iconv_t icd = (iconv_t)(-1);
/* Copy any literal text that lies between the current position and the start
of the MIME word. */
222 if (mimeword != string)
223 yield = string_catn(yield, string, mimeword - string);
224 /*XXX that might have to convert an untainted string to a tainted one */
226 /* Do a charset translation if required. This is supported only on hosts
227 that have the iconv() function. Translation errors set error, but carry on,
228 using the untranslated data. If there is more than one error, the message
229 passed back refers to the final one. We use a loop to cater for the case
230 of long strings - the RFC puts limits on the length, but it's best to be
235 if (target && strcmpic(target, mimeword+2) != 0)
236 if ((icd = iconv_open(CS target, CS(mimeword+2))) == (iconv_t)-1)
237 *error = string_sprintf("iconv_open(\"%s\", \"%s\") failed: %s%s",
238 target, mimeword+2, strerror(errno),
239 (errno == EINVAL)? " (maybe unsupported conversion)" : "");
245 uschar *tptr = NULL; /* Stops compiler warning */
/* Converted output is written into tbuffer, starting from its beginning, with
the whole of tbuffer available. */
250 uschar *outptr = tbuffer;
251 size_t outleft = sizeof(tbuffer);
253 /* If translation is required, go for it. */
255 if (icd != (iconv_t)(-1))
/* The return value of iconv() is deliberately discarded; whether any output
was produced is detected below by checking if outptr has moved past tbuffer. */
257 (void)iconv(icd, (ICONV_ARG2_TYPE)(&dptr), &dlen, CSS &outptr, &outleft);
259 /* If outptr has been adjusted, there is some output. Set up to add it to
260 the output buffer. The function will have adjusted dptr and dlen. If
261 iconv() stopped because of an error, we'll pick it up next time when
264 If there is no output, we expect there to have been a translation
265 error, because we know there was at least one input byte. We leave the
266 value of tlen as -1, which causes the rest of the input to be copied
269 if (outptr > tbuffer)
272 tlen = outptr - tbuffer;
276 DEBUG(D_any) debug_printf("iconv error translating \"%.*s\" to %s: "
277 "%s\n", (int)(endword + 2 - mimeword), mimeword, target, strerror(errno));
283 /* No charset translation is happening or there was a translation error;
284 just set up the original as the string to be added, and mark it all used.
294 /* Deal with zero values; convert them if requested. */
/* Each zero byte in the tlen bytes at tptr is overwritten in place with
zeroval. */
297 for (int i = 0; i < tlen; i++)
298 if (tptr[i] == 0) tptr[i] = zeroval;
300 /* Add the new string onto the result */
302 yield = string_catn(yield, tptr, tlen);
/* Release the conversion descriptor if one is open. */
306 if (icd != (iconv_t)(-1)) iconv_close(icd);
309 /* Update string past the MIME word; skip any white space if the next thing
310 is another MIME word. */
/* endword addresses the closing ?= of the word, so adding two steps past it. */
312 string = endword + 2;
313 mimeword = decode_mimeword(string, lencheck, &q1, &q2, &endword, &dlen, &dptr);
/* If the scan over white space ends exactly at the next MIME word, resume from
that word so the white space in between is not copied to the result. */
317 while (isspace(*s)) s++;
318 if (s == mimeword) string = s;
322 /* Copy the remaining characters of the string, zero-terminate it, and return
323 the length as well if requested. */
325 yield = string_cat(yield, string);
/* yield->ptr is reported as the result length and yield->size as the size of
the store block, each only when the corresponding pointer is not NULL. */
327 if (lenptr) *lenptr = yield->ptr;
328 if (sizeptr) *sizeptr = yield->size;
329 return string_from_gstring(yield);
333 /* This is the stub that provides the original interface without the sizeptr
334 argument. */
337 rfc2047_decode(uschar *string, BOOL lencheck, const uschar *target, int zeroval,
338 int *lenptr, uschar **error)
/* Forward every argument unchanged to rfc2047_decode2(), supplying NULL for
its sizeptr parameter, and return whatever it returns. */
340 return rfc2047_decode2(string, lencheck, target, zeroval, lenptr, NULL, error);
343 /* End of rfc2047.c */