src/src/rfc2047.c

   1 /*************************************************
   2 *     Exim - an Internet mail transport agent    *
   3 *************************************************/
   4
   5 /* Copyright (c) University of Cambridge 1995 - 2018 */
   6 /* See the file NOTICE for conditions of use and distribution. */
   7
   8 /* This file contains a function for decoding message header lines that may
   9 contain encoded "words" according to the rules described in
  10
  11   RFC-2047 at http://www.ietf.org/rfc/rfc2047.txt
  12
  13 The function is a rewritten version of code created by Norihisa Washitake.
   14 The original could be used either inside Exim (as part of a patch) or in a
  15 freestanding form. The original contained some built-in code conversions; I
  16 have chosen only to do code conversions if iconv() is supported by the OS.
  17 Because there were quite a lot of hacks to be done, for a variety of reasons,
  18 I rewrote the code.
  19
  20 You can find the latest version of the original library at
  21
  22   http://washitake.com/mail/exim/mime/
  23
  24 The code below is almost completely unlike the original. */
  25
  26
  27 #include "exim.h"
  28
  29
  30 /*************************************************
  31 *                Do a QP conversion              *
  32 *************************************************/
  33
  34 /* This function decodes "quoted printable" into bytes.
  35
  36 Arguments:
  37   string      the string that includes QP escapes
  38   ptrptr      where to return pointer to the decoded string
  39
  40 Returns:      the length of the decoded string, or -1 on failure
  41 */
  42
  43 static int
  44 rfc2047_qpdecode(uschar *string, uschar **ptrptr)
  45 {
  46 int len = 0;
  47 uschar *ptr;
  48
  49 ptr = *ptrptr = store_get(Ustrlen(string) + 1, is_tainted(string));  /* No longer than this */
  50
  51 while (*string != 0)
  52   {
  53   int ch = *string++;
  54
  55   if (ch == '_') *ptr++ = ' ';
  56   else if (ch == '=')
  57     {
  58     int a = *string;
  59     int b = (a == 0)? 0 : string[1];
  60     if (!isxdigit(a) || !isxdigit(b)) return -1;  /* Bad QP string */
  61     *ptr++ = ((Ustrchr(hex_digits, tolower(a)) - hex_digits) << 4) +
  62                Ustrchr(hex_digits, tolower(b)) - hex_digits;
  63     string += 2;
  64     }
  65   else if (ch == ' ' || ch == '\t') return -1;    /* Whitespace is illegal */
  66   else *ptr++ = ch;
  67
  68   len++;
  69   }
  70
  71 *ptr = 0;
  72 return len;
  73 }
  74
  75
  76
  77 /*************************************************
  78 *            Decode next MIME word               *
  79 *************************************************/
  80
  81 /* Scan a string to see if a MIME word exists; pass back the separator
  82 points in the string.
  83
  84 Arguments:
  85   string     subject string
  86   lencheck   TRUE to enforce maximum length check
  87   q1ptr      pass back address of first question mark
  88   q2ptr      pass back address of second question mark
  89   endptr     pass back address of final ?=
  90   dlenptr    pass back length of decoded string
  91   dptrptr    pass back pointer to decoded string
  92
  93 Returns:     address of =? or NULL if not present
  94 */
  95
  96 static uschar *
  97 decode_mimeword(uschar *string, BOOL lencheck, uschar **q1ptr, uschar **q2ptr,
  98   uschar **endptr, size_t *dlenptr, uschar **dptrptr)
  99 {
 100 uschar *mimeword;
 101 for (;; string = mimeword + 2)
 102   {
 103   int encoding;
 104   int dlen = -1;
 105
 106   if ((mimeword = Ustrstr(string, "=?"))  == NULL ||
 107       (*q1ptr = Ustrchr(mimeword+2, '?')) == NULL ||
 108       (*q2ptr = Ustrchr(*q1ptr+1, '?')) == NULL ||
 109       (*endptr = Ustrstr(*q2ptr+1, "?=")) == NULL) return NULL;
 110
 111   /* We have found =?xxx?xxx?xxx?= in the string. Optionally check the
 112   length, and that the second field is just one character long. If not,
 113   continue the loop to search again. We must start just after the initial =?
 114   because we might have found =?xxx=?xxx?xxx?xxx?=. */
 115
 116   if ((lencheck && *endptr - mimeword > 73) || *q2ptr - *q1ptr != 2) continue;
 117
 118   /* Get the encoding letter, and decode the data string. */
 119
 120   encoding = toupper((*q1ptr)[1]);
 121   **endptr = 0;
 122   if (encoding == 'B')
 123     dlen = b64decode(*q2ptr+1, dptrptr);
 124   else if (encoding == 'Q')
 125     dlen = rfc2047_qpdecode(*q2ptr+1, dptrptr);
 126   **endptr = '?';   /* restore */
 127
 128   /* If the decoding succeeded, we are done. Set the length of the decoded
 129   string, and pass back the initial pointer. Otherwise, the loop continues. */
 130
 131   if (dlen >= 0)
 132     {
 133     *dlenptr = (size_t)dlen;
 134     return mimeword;
 135     }
 136   }
 137
 138 /* Control should never actually get here */
 139 }
 140
 141
 142
 143 /*************************************************
 144 *    Decode and convert an RFC 2047 string       *
 145 *************************************************/
 146
 147 /* There are two functions defined here. The original one was rfc2047_decode()
 148 and it was documented in the local_scan() interface. I needed to add an extra
 149 argument for use by expand_string(), so I created rfc2047_decode2() for that
 150 purpose. The original function became a stub that just supplies NULL for the
 151 new argument (sizeptr).
 152
 153 An RFC 2047-encoded string may contain one or more "words", each of the
 154 form  =?...?.?...?=  with the first ... specifying the character code, the
 155 second being Q (for quoted printable) or B for Base64 encoding. The third ...
 156 is the actual data.
 157
 158 This function first decodes each "word" into bytes from the Q or B encoding.
 159 Then, if provided with the name of a charset encoding, and if iconv() is
 160 available, it attempts to translate the result to the named character set.
 161 If this fails, the binary string is returned with an error message.
 162
 163 If a binary zero is encountered in the decoded string, it is replaced by the
 164 contents of the zeroval argument. For use with Exim headers, the value must not
 165 be 0 because they are handled as zero-terminated strings. When zeroval==0,
 166 lenptr should not be NULL.
 167
 168 Arguments:
 169     string       the subject string
 170     lencheck     TRUE to enforce maximum MIME word length
 171     target       the name of the target encoding for MIME words, or NULL for
 172                    no charset translation
 173     zeroval      the value to use for binary zero bytes
 174     lenptr       if not NULL, the length of the result is returned via
 175                    this variable
 176     sizeptr      if not NULL, the length of a new store block in which the
 177                    result is built is placed here; if no new store is obtained,
 178                    the value is not changed
 179     error        for error messages; NULL if no problem; this can be set
 180                    when the yield is non-NULL if there was a charset
 181                    translation problem
 182
 183 Returns:         the decoded, converted string, or NULL on error; if there are
 184                    no MIME words in the string, the original string is returned
 185 */
 186
 187 uschar *
 188 rfc2047_decode2(uschar *string, BOOL lencheck, uschar *target, int zeroval,
 189   int *lenptr, int *sizeptr, uschar **error)
 190 {
 191 int size = Ustrlen(string);
 192 size_t dlen;
 193 uschar *dptr;
 194 gstring *yield;
 195 uschar *mimeword, *q1, *q2, *endword;
 196
 197 *error = NULL;
 198 mimeword = decode_mimeword(string, lencheck, &q1, &q2, &endword, &dlen, &dptr);
 199
 200 if (!mimeword)
 201   {
 202   if (lenptr) *lenptr = size;
 203   return string;
 204   }
 205
 206 /* Scan through the string, decoding MIME words and copying intermediate text,
 207 building the result as we go. The result may be longer than the input if it is
 208 translated into a multibyte code such as UTF-8. That's why we use the dynamic
 209 string building code. */
 210
 211 yield = store_get(sizeof(gstring) + ++size, is_tainted(string));
 212 yield->size = size;
 213 yield->ptr = 0;
 214 yield->s = US(yield + 1);
 215
 216 while (mimeword)
 217   {
 218
 219   #if HAVE_ICONV
 220   iconv_t icd = (iconv_t)(-1);
 221   #endif
 222
 223   if (mimeword != string)
 224     yield = string_catn(yield, string, mimeword - string);
 225 /*XXX that might have to convert an untainted string to a tainted one */
 226
 227   /* Do a charset translation if required. This is supported only on hosts
 228   that have the iconv() function. Translation errors set error, but carry on,
 229   using the untranslated data. If there is more than one error, the message
 230   passed back refers to the final one. We use a loop to cater for the case
 231   of long strings - the RFC puts limits on the length, but it's best to be
 232   robust. */
 233
 234   #if HAVE_ICONV
 235   *q1 = 0;
 236   if (target != NULL && strcmpic(target, mimeword+2) != 0)
 237     {
 238     icd = iconv_open(CS target, CS(mimeword+2));
 239
 240     if (icd == (iconv_t)(-1))
 241       {
 242       *error = string_sprintf("iconv_open(\"%s\", \"%s\") failed: %s%s",
 243         target, mimeword+2, strerror(errno),
 244         (errno == EINVAL)? " (maybe unsupported conversion)" : "");
 245       }
 246     }
 247   *q1 = '?';
 248   #endif
 249
 250   while (dlen > 0)
 251     {
 252     uschar *tptr = NULL;   /* Stops compiler warning */
 253     int tlen = -1;
 254
 255     #if HAVE_ICONV
 256     uschar tbuffer[256];
 257     uschar *outptr = tbuffer;
 258     size_t outleft = sizeof(tbuffer);
 259
 260     /* If translation is required, go for it. */
 261
 262     if (icd != (iconv_t)(-1))
 263       {
 264       (void)iconv(icd, (ICONV_ARG2_TYPE)(&dptr), &dlen, CSS &outptr, &outleft);
 265
 266       /* If outptr has been adjusted, there is some output. Set up to add it to
 267       the output buffer. The function will have adjusted dptr and dlen. If
 268       iconv() stopped because of an error, we'll pick it up next time when
 269       there's no output.
 270
 271       If there is no output, we expect there to have been a translation
 272       error, because we know there was at least one input byte. We leave the
 273       value of tlen as -1, which causes the rest of the input to be copied
 274       verbatim. */
 275
 276       if (outptr > tbuffer)
 277         {
 278         tptr = tbuffer;
 279         tlen = outptr - tbuffer;
 280         }
 281       else
 282         {
 283         DEBUG(D_any) debug_printf("iconv error translating \"%.*s\" to %s: "
 284         "%s\n", (int)(endword + 2 - mimeword), mimeword, target, strerror(errno));
 285         }
 286       }
 287
 288     #endif
 289
 290     /* No charset translation is happening or there was a translation error;
 291     just set up the original as the string to be added, and mark it all used.
 292     */
 293
 294     if (tlen == -1)
 295       {
 296       tptr = dptr;
 297       tlen = dlen;
 298       dlen = 0;
 299       }
 300
 301     /* Deal with zero values; convert them if requested. */
 302
 303     if (zeroval != 0)
 304       for (int i = 0; i < tlen; i++)
 305         if (tptr[i] == 0) tptr[i] = zeroval;
 306
 307     /* Add the new string onto the result */
 308
 309     yield = string_catn(yield, tptr, tlen);
 310     }
 311
 312   #if HAVE_ICONV
 313   if (icd != (iconv_t)(-1))  iconv_close(icd);
 314   #endif
 315
 316   /* Update string past the MIME word; skip any white space if the next thing
 317   is another MIME word. */
 318
 319   string = endword + 2;
 320   mimeword = decode_mimeword(string, lencheck, &q1, &q2, &endword, &dlen, &dptr);
 321   if (mimeword)
 322     {
 323     uschar *s = string;
 324     while (isspace(*s)) s++;
 325     if (s == mimeword) string = s;
 326     }
 327   }
 328
 329 /* Copy the remaining characters of the string, zero-terminate it, and return
 330 the length as well if requested. */
 331
 332 yield = string_cat(yield, string);
 333
 334 if (lenptr) *lenptr = yield->ptr;
 335 if (sizeptr) *sizeptr = yield->size;
 336 return string_from_gstring(yield);
 337 }
 338
 339
 340 /* This is the stub that provides the original interface without the sizeptr
 341 argument. */
 342
 343 uschar *
 344 rfc2047_decode(uschar *string, BOOL lencheck, uschar *target, int zeroval,
 345   int *lenptr, uschar **error)
 346 {
 347 return rfc2047_decode2(string, lencheck, target, zeroval, lenptr, NULL, error);
 348 }
 349
 350 /* End of rfc2047.c */