src/src/rfc2047.c

   1 /*************************************************
   2 *     Exim - an Internet mail transport agent    *
   3 *************************************************/
   4
   5 /* Copyright (c) University of Cambridge 1995 - 2018 */
   6 /* Copyright (c) The Exim Maintainers 2020 - 2021 */
   7 /* See the file NOTICE for conditions of use and distribution. */
   8
   9 /* This file contains a function for decoding message header lines that may
  10 contain encoded "words" according to the rules described in
  11
  12   RFC-2047 at http://www.ietf.org/rfc/rfc2047.txt
  13
  14 The function is a rewritten version of code created by Norihisa Washitake.
  15 The original could be used both inside Exim (as part of a patch) or in a
  16 freestanding form. The original contained some built-in code conversions; I
  17 have chosen only to do code conversions if iconv() is supported by the OS.
  18 Because there were quite a lot of hacks to be done, for a variety of reasons,
  19 I rewrote the code.
  20
  21 You can find the latest version of the original library at
  22
  23   http://washitake.com/mail/exim/mime/
  24
  25 The code below is almost completely unlike the original. */
  26
  27
  28 #include "exim.h"
  29
  30
  31 /*************************************************
  32 *                Do a QP conversion              *
  33 *************************************************/
  34
  35 /* This function decodes "quoted printable" into bytes.
  36
  37 Arguments:
  38   string      the string that includes QP escapes
  39   ptrptr      where to return pointer to the decoded string
  40
  41 Returns:      the length of the decoded string, or -1 on failure
  42 */
  43
  44 static int
  45 rfc2047_qpdecode(uschar *string, uschar **ptrptr)
  46 {
  47 int len = 0;
  48 uschar *ptr;
  49
  50 ptr = *ptrptr = store_get(Ustrlen(string) + 1, is_tainted(string));  /* No longer than this */
  51
  52 while (*string != 0)
  53   {
  54   int ch = *string++;
  55
  56   if (ch == '_') *ptr++ = ' ';
  57   else if (ch == '=')
  58     {
  59     int a = *string;
  60     int b = (a == 0)? 0 : string[1];
  61     if (!isxdigit(a) || !isxdigit(b)) return -1;  /* Bad QP string */
  62     *ptr++ = ((Ustrchr(hex_digits, tolower(a)) - hex_digits) << 4) +
  63                Ustrchr(hex_digits, tolower(b)) - hex_digits;
  64     string += 2;
  65     }
  66   else if (ch == ' ' || ch == '\t') return -1;    /* Whitespace is illegal */
  67   else *ptr++ = ch;
  68
  69   len++;
  70   }
  71
  72 *ptr = 0;
  73 return len;
  74 }
  75
  76
  77
  78 /*************************************************
  79 *            Decode next MIME word               *
  80 *************************************************/
  81
  82 /* Scan a string to see if a MIME word exists; pass back the separator
  83 points in the string.
  84
  85 Arguments:
  86   string     subject string
  87   lencheck   TRUE to enforce maximum length check
  88   q1ptr      pass back address of first question mark
  89   q2ptr      pass back address of second question mark
  90   endptr     pass back address of final ?=
  91   dlenptr    pass back length of decoded string
  92   dptrptr    pass back pointer to decoded string
  93
  94 Returns:     address of =? or NULL if not present
  95 */
  96
  97 static uschar *
  98 decode_mimeword(uschar *string, BOOL lencheck, uschar **q1ptr, uschar **q2ptr,
  99   uschar **endptr, size_t *dlenptr, uschar **dptrptr)
 100 {
 101 uschar *mimeword;
 102 for (;; string = mimeword + 2)
 103   {
 104   int encoding;
 105   int dlen = -1;
 106
 107   if ((mimeword = Ustrstr(string, "=?"))  == NULL ||
 108       (*q1ptr = Ustrchr(mimeword+2, '?')) == NULL ||
 109       (*q2ptr = Ustrchr(*q1ptr+1, '?')) == NULL ||
 110       (*endptr = Ustrstr(*q2ptr+1, "?=")) == NULL) return NULL;
 111
 112   /* We have found =?xxx?xxx?xxx?= in the string. Optionally check the
 113   length, and that the second field is just one character long. If not,
 114   continue the loop to search again. We must start just after the initial =?
 115   because we might have found =?xxx=?xxx?xxx?xxx?=. */
 116
 117   if ((lencheck && *endptr - mimeword > 73) || *q2ptr - *q1ptr != 2) continue;
 118
 119   /* Get the encoding letter, and decode the data string. */
 120
 121   encoding = toupper((*q1ptr)[1]);
 122   **endptr = 0;
 123   if (encoding == 'B')
 124     dlen = b64decode(*q2ptr+1, dptrptr);
 125   else if (encoding == 'Q')
 126     dlen = rfc2047_qpdecode(*q2ptr+1, dptrptr);
 127   **endptr = '?';   /* restore */
 128
 129   /* If the decoding succeeded, we are done. Set the length of the decoded
 130   string, and pass back the initial pointer. Otherwise, the loop continues. */
 131
 132   if (dlen >= 0)
 133     {
 134     *dlenptr = (size_t)dlen;
 135     return mimeword;
 136     }
 137   }
 138
 139 /* Control should never actually get here */
 140 }
 141
 142
 143
 144 /*************************************************
 145 *    Decode and convert an RFC 2047 string       *
 146 *************************************************/
 147
 148 /* There are two functions defined here. The original one was rfc2047_decode()
 149 and it was documented in the local_scan() interface. I needed to add an extra
 150 argument for use by expand_string(), so I created rfc2047_decode2() for that
 151 purpose. The original function became a stub that just supplies NULL for the
 152 new argument (sizeptr).
 153
 154 An RFC 2047-encoded string may contain one or more "words", each of the
 155 form  =?...?.?...?=  with the first ... specifying the character code, the
 156 second being Q (for quoted printable) or B for Base64 encoding. The third ...
 157 is the actual data.
 158
 159 This function first decodes each "word" into bytes from the Q or B encoding.
 160 Then, if provided with the name of a charset encoding, and if iconv() is
 161 available, it attempts to translate the result to the named character set.
 162 If this fails, the binary string is returned with an error message.
 163
 164 If a binary zero is encountered in the decoded string, it is replaced by the
 165 contents of the zeroval argument. For use with Exim headers, the value must not
 166 be 0 because they are handled as zero-terminated strings. When zeroval==0,
 167 lenptr should not be NULL.
 168
 169 Arguments:
 170     string       the subject string
 171     lencheck     TRUE to enforce maximum MIME word length
 172     target       the name of the target encoding for MIME words, or NULL for
 173                    no charset translation
 174     zeroval      the value to use for binary zero bytes
 175     lenptr       if not NULL, the length of the result is returned via
 176                    this variable
 177     sizeptr      if not NULL, the length of a new store block in which the
 178                    result is built is placed here; if no new store is obtained,
 179                    the value is not changed
 180     error        for error messages; NULL if no problem; this can be set
 181                    when the yield is non-NULL if there was a charset
 182                    translation problem
 183
 184 Returns:         the decoded, converted string, or NULL on error; if there are
 185                    no MIME words in the string, the original string is returned
 186 */
 187
 188 uschar *
 189 rfc2047_decode2(uschar *string, BOOL lencheck, const uschar *target,
 190   int zeroval, int *lenptr, int *sizeptr, uschar **error)
 191 {
 192 int size = Ustrlen(string);
 193 size_t dlen;
 194 uschar *dptr;
 195 gstring *yield;
 196 uschar *mimeword, *q1, *q2, *endword;
 197
 198 *error = NULL;
 199 mimeword = decode_mimeword(string, lencheck, &q1, &q2, &endword, &dlen, &dptr);
 200
 201 if (!mimeword)
 202   {
 203   if (lenptr) *lenptr = size;
 204   return string;
 205   }
 206
 207 /* Scan through the string, decoding MIME words and copying intermediate text,
 208 building the result as we go. The result may be longer than the input if it is
 209 translated into a multibyte code such as UTF-8. That's why we use the dynamic
 210 string building code. */
 211
 212 yield = store_get(sizeof(gstring) + ++size, is_tainted(string));
 213 yield->size = size;
 214 yield->ptr = 0;
 215 yield->s = US(yield + 1);
 216
 217 while (mimeword)
 218   {
 219
 220   #if HAVE_ICONV
 221   iconv_t icd = (iconv_t)(-1);
 222   #endif
 223
 224   if (mimeword != string)
 225     yield = string_catn(yield, string, mimeword - string);
 226 /*XXX that might have to convert an untainted string to a tainted one */
 227
 228   /* Do a charset translation if required. This is supported only on hosts
 229   that have the iconv() function. Translation errors set error, but carry on,
 230   using the untranslated data. If there is more than one error, the message
 231   passed back refers to the final one. We use a loop to cater for the case
 232   of long strings - the RFC puts limits on the length, but it's best to be
 233   robust. */
 234
 235   #if HAVE_ICONV
 236   *q1 = 0;
 237   if (target && strcmpic(target, mimeword+2) != 0)
 238     if ((icd = iconv_open(CS target, CS(mimeword+2))) == (iconv_t)-1)
 239       *error = string_sprintf("iconv_open(\"%s\", \"%s\") failed: %s%s",
 240         target, mimeword+2, strerror(errno),
 241         (errno == EINVAL)? " (maybe unsupported conversion)" : "");
 242   *q1 = '?';
 243   #endif
 244
 245   while (dlen > 0)
 246     {
 247     uschar *tptr = NULL;   /* Stops compiler warning */
 248     int tlen = -1;
 249
 250     #if HAVE_ICONV
 251     uschar tbuffer[256];
 252     uschar *outptr = tbuffer;
 253     size_t outleft = sizeof(tbuffer);
 254
 255     /* If translation is required, go for it. */
 256
 257     if (icd != (iconv_t)(-1))
 258       {
 259       (void)iconv(icd, (ICONV_ARG2_TYPE)(&dptr), &dlen, CSS &outptr, &outleft);
 260
 261       /* If outptr has been adjusted, there is some output. Set up to add it to
 262       the output buffer. The function will have adjusted dptr and dlen. If
 263       iconv() stopped because of an error, we'll pick it up next time when
 264       there's no output.
 265
 266       If there is no output, we expect there to have been a translation
 267       error, because we know there was at least one input byte. We leave the
 268       value of tlen as -1, which causes the rest of the input to be copied
 269       verbatim. */
 270
 271       if (outptr > tbuffer)
 272         {
 273         tptr = tbuffer;
 274         tlen = outptr - tbuffer;
 275         }
 276       else
 277         {
 278         DEBUG(D_any) debug_printf("iconv error translating \"%.*s\" to %s: "
 279         "%s\n", (int)(endword + 2 - mimeword), mimeword, target, strerror(errno));
 280         }
 281       }
 282
 283     #endif
 284
 285     /* No charset translation is happening or there was a translation error;
 286     just set up the original as the string to be added, and mark it all used.
 287     */
 288
 289     if (tlen == -1)
 290       {
 291       tptr = dptr;
 292       tlen = dlen;
 293       dlen = 0;
 294       }
 295
 296     /* Deal with zero values; convert them if requested. */
 297
 298     if (zeroval != 0)
 299       for (int i = 0; i < tlen; i++)
 300         if (tptr[i] == 0) tptr[i] = zeroval;
 301
 302     /* Add the new string onto the result */
 303
 304     yield = string_catn(yield, tptr, tlen);
 305     }
 306
 307   #if HAVE_ICONV
 308   if (icd != (iconv_t)(-1))  iconv_close(icd);
 309   #endif
 310
 311   /* Update string past the MIME word; skip any white space if the next thing
 312   is another MIME word. */
 313
 314   string = endword + 2;
 315   mimeword = decode_mimeword(string, lencheck, &q1, &q2, &endword, &dlen, &dptr);
 316   if (mimeword)
 317     {
 318     uschar *s = string;
 319     while (isspace(*s)) s++;
 320     if (s == mimeword) string = s;
 321     }
 322   }
 323
 324 /* Copy the remaining characters of the string, zero-terminate it, and return
 325 the length as well if requested. */
 326
 327 yield = string_cat(yield, string);
 328
 329 if (lenptr) *lenptr = yield->ptr;
 330 if (sizeptr) *sizeptr = yield->size;
 331 return string_from_gstring(yield);
 332 }
 333
 334
 335 /* This is the stub that provides the original interface without the sizeptr
 336 argument. */
 337
 338 uschar *
 339 rfc2047_decode(uschar *string, BOOL lencheck, const uschar *target, int zeroval,
 340   int *lenptr, uschar **error)
 341 {
 342 return rfc2047_decode2(string, lencheck, target, zeroval, lenptr, NULL, error);
 343 }
 344
 345 /* End of rfc2047.c */