src/src/rfc2047.c

   1 /* $Cambridge: exim/src/src/rfc2047.c,v 1.4 2007/01/08 10:50:18 ph10 Exp $ */
   2
   3 /*************************************************
   4 *     Exim - an Internet mail transport agent    *
   5 *************************************************/
   6
   7 /* Copyright (c) University of Cambridge 1995 - 2007 */
   8 /* See the file NOTICE for conditions of use and distribution. */
   9
  10 /* This file contains a function for decoding message header lines that may
  11 contain encoded "words" according to the rules described in
  12
  13   RFC-2047 at http://www.ietf.org/rfc/rfc2047.txt
  14
  15 The function is a rewritten version of code created by Norihisa Washitake.
  16 The original could be used either inside Exim (as part of a patch) or in a
  17 freestanding form. The original contained some built-in code conversions; I
  18 have chosen only to do code conversions if iconv() is supported by the OS.
  19 Because there were quite a lot of hacks to be done, for a variety of reasons,
  20 I rewrote the code.
  21
  22 You can find the latest version of the original library at
  23
  24   http://washitake.com/mail/exim/mime/
  25
  26 The code below is almost completely unlike the original. */
  27
  28
  29 #include "exim.h"
  30
  31
  32 /*************************************************
  33 *                Do a QP conversion              *
  34 *************************************************/
  35
  36 /* This function decodes "quoted printable" into bytes.
  37
  38 Arguments:
  39   string      the string that includes QP escapes
  40   ptrptr      where to return pointer to the decoded string
  41
  42 Returns:      the length of the decoded string, or -1 on failure
  43 */
  44
  45 static int
  46 rfc2047_qpdecode(uschar *string, uschar **ptrptr)
  47 {
  48 int len = 0;
  49 uschar *ptr;
  50
  51 ptr = *ptrptr = store_get(Ustrlen(string) + 1);  /* No longer than this */
  52
  53 while (*string != 0)
  54   {
  55   register int ch = *string++;
  56
  57   if (ch == '_') *ptr++ = ' ';
  58   else if (ch == '=')
  59     {
  60     int a = *string;
  61     int b = (a == 0)? 0 : string[1];
  62     if (!isxdigit(a) || !isxdigit(b)) return -1;  /* Bad QP string */
  63     *ptr++ = ((Ustrchr(hex_digits, tolower(a)) - hex_digits) << 4) +
  64                Ustrchr(hex_digits, tolower(b)) - hex_digits;
  65     string += 2;
  66     }
  67   else if (ch == ' ' || ch == '\t') return -1;    /* Whitespace is illegal */
  68   else *ptr++ = ch;
  69
  70   len++;
  71   }
  72
  73 *ptr = 0;
  74 return len;
  75 }
  76
  77
  78
  79 /*************************************************
  80 *            Decode next MIME word               *
  81 *************************************************/
  82
  83 /* Scan a string to see if a MIME word exists; pass back the separator
  84 points in the string.
  85
  86 Arguments:
  87   string     subject string
  88   lencheck   TRUE to enforce maximum length check
  89   q1ptr      pass back address of first question mark
  90   q2ptr      pass back address of second question mark
  91   endptr     pass back address of final ?=
  92   dlenptr    pass back length of decoded string
  93   dptrptr    pass back pointer to decoded string
  94
  95 Returns:     address of =? or NULL if not present
  96 */
  97
  98 static uschar *
  99 decode_mimeword(uschar *string, BOOL lencheck, uschar **q1ptr, uschar **q2ptr,
 100   uschar **endptr, size_t *dlenptr, uschar **dptrptr)
 101 {
 102 uschar *mimeword;
 103 for (;; string = mimeword + 2)
 104   {
 105   int encoding;
 106   int dlen = -1;
 107
 108   if ((mimeword = Ustrstr(string, "=?"))  == NULL ||
 109       (*q1ptr = Ustrchr(mimeword+2, '?')) == NULL ||
 110       (*q2ptr = Ustrchr(*q1ptr+1, '?')) == NULL ||
 111       (*endptr = Ustrstr(*q2ptr+1, "?=")) == NULL) return NULL;
 112
 113   /* We have found =?xxx?xxx?xxx?= in the string. Optionally check the
 114   length, and that the second field is just one character long. If not,
 115   continue the loop to search again. We must start just after the initial =?
 116   because we might have found =?xxx=?xxx?xxx?xxx?=. */
 117
 118   if ((lencheck && *endptr - mimeword > 73) || *q2ptr - *q1ptr != 2) continue;
 119
 120   /* Get the encoding letter, and decode the data string. */
 121
 122   encoding = toupper((*q1ptr)[1]);
 123   **endptr = 0;
 124   if (encoding == 'B')
 125     dlen = auth_b64decode(*q2ptr+1, dptrptr);
 126   else if (encoding == 'Q')
 127     dlen = rfc2047_qpdecode(*q2ptr+1, dptrptr);
 128   **endptr = '?';   /* restore */
 129
 130   /* If the decoding succeeded, we are done. Set the length of the decoded
 131   string, and pass back the initial pointer. Otherwise, the loop continues. */
 132
 133   if (dlen >= 0)
 134     {
 135     *dlenptr = (size_t)dlen;
 136     return mimeword;
 137     }
 138   }
 139
 140 /* Control should never actually get here */
 141 }
 142
 143
 144
 145 /*************************************************
 146 *    Decode and convert an RFC 2047 string       *
 147 *************************************************/
 148
 149 /* There are two functions defined here. The original one was rfc2047_decode()
 150 and it was documented in the local_scan() interface. I needed to add an extra
 151 argument for use by expand_string(), so I created rfc2047_decode2() for that
 152 purpose. The original function became a stub that just supplies NULL for the
 153 new argument (sizeptr).
 154
 155 An RFC 2047-encoded string may contain one or more "words", each of the
 156 form  =?...?.?...?=  with the first ... specifying the character code, the
 157 second being Q (for quoted printable) or B for Base64 encoding. The third ...
 158 is the actual data.
 159
 160 This function first decodes each "word" into bytes from the Q or B encoding.
 161 Then, if provided with the name of a charset encoding, and if iconv() is
 162 available, it attempts to translate the result to the named character set.
 163 If this fails, the binary string is returned with an error message.
 164
 165 If a binary zero is encountered in the decoded string, it is replaced by the
 166 contents of the zeroval argument. For use with Exim headers, the value must not
 167 be 0 because they are handled as zero-terminated strings. When zeroval==0,
 168 lenptr should not be NULL.
 169
 170 Arguments:
 171     string       the subject string
 172     lencheck     TRUE to enforce maximum MIME word length
 173     target       the name of the target encoding for MIME words, or NULL for
 174                    no charset translation
 175     zeroval      the value to use for binary zero bytes
 176     lenptr       if not NULL, the length of the result is returned via
 177                    this variable
 178     sizeptr      if not NULL, the length of a new store block in which the
 179                    result is built is placed here; if no new store is obtained,
 180                    the value is not changed
 181     error        for error messages; NULL if no problem; this can be set
 182                    when the yield is non-NULL if there was a charset
 183                    translation problem
 184
 185 Returns:         the decoded, converted string, or NULL on error; if there are
 186                    no MIME words in the string, the original string is returned
 187 */
 188
 189 uschar *
 190 rfc2047_decode2(uschar *string, BOOL lencheck, uschar *target, int zeroval,
 191   int *lenptr, int *sizeptr, uschar **error)
 192 {
 193 int ptr = 0;
 194 int size = Ustrlen(string);
 195 size_t dlen;
 196 uschar *dptr, *yield;
 197 uschar *mimeword, *q1, *q2, *endword;
 198
 199 *error = NULL;
 200 mimeword = decode_mimeword(string, lencheck, &q1, &q2, &endword, &dlen, &dptr);
 201
 202 if (mimeword == NULL)
 203   {
 204   if (lenptr != NULL) *lenptr = size;
 205   return string;
 206   }
 207
 208 /* Scan through the string, decoding MIME words and copying intermediate text,
 209 building the result as we go. The result may be longer than the input if it is
 210 translated into a multibyte code such as UTF-8. That's why we use the dynamic
 211 string building code. */
 212
 213 yield = store_get(++size);
 214
 215 while (mimeword != NULL)
 216   {
 217
 218   #if HAVE_ICONV
 219   iconv_t icd = (iconv_t)(-1);
 220   #endif
 221
 222   if (mimeword != string)
 223     yield = string_cat(yield, &size, &ptr, string, mimeword - string);
 224
 225   /* Do a charset translation if required. This is supported only on hosts
 226   that have the iconv() function. Translation errors set error, but carry on,
 227   using the untranslated data. If there is more than one error, the message
 228   passed back refers to the final one. We use a loop to cater for the case
 229   of long strings - the RFC puts limits on the length, but it's best to be
 230   robust. */
 231
 232   #if HAVE_ICONV
 233   *q1 = 0;
 234   if (target != NULL && strcmpic(target, mimeword+2) != 0)
 235     {
 236     icd = iconv_open(CS target, CS(mimeword+2));
 237
 238     if (icd == (iconv_t)(-1))
 239       {
 240       *error = string_sprintf("iconv_open(\"%s\", \"%s\") failed: %s%s",
 241         target, mimeword+2, strerror(errno),
 242         (errno == EINVAL)? " (maybe unsupported conversion)" : "");
 243       }
 244     }
 245   *q1 = '?';
 246   #endif
 247
 248   while (dlen > 0)
 249     {
 250     uschar *tptr = NULL;   /* Stops compiler warning */
 251     int tlen = -1;
 252
 253     #if HAVE_ICONV
 254     uschar tbuffer[256];
 255     uschar *outptr = tbuffer;
 256     size_t outleft = sizeof(tbuffer);
 257
 258     /* If translation is required, go for it. */
 259
 260     if (icd != (iconv_t)(-1))
 261       {
 262       (void)iconv(icd, (ICONV_ARG2_TYPE)(&dptr), &dlen, CSS &outptr, &outleft);
 263
 264       /* If outptr has been adjusted, there is some output. Set up to add it to
 265       the output buffer. The function will have adjusted dptr and dlen. If
 266       iconv() stopped because of an error, we'll pick it up next time when
 267       there's no output.
 268
 269       If there is no output, we expect there to have been a translation
 270       error, because we know there was at least one input byte. We leave the
 271       value of tlen as -1, which causes the rest of the input to be copied
 272       verbatim. */
 273
 274       if (outptr > tbuffer)
 275         {
 276         tptr = tbuffer;
 277         tlen = outptr - tbuffer;
 278         }
 279       else
 280         {
 281         DEBUG(D_any) debug_printf("iconv error translating \"%.*s\" to %s: "
 282         "%s\n", endword + 2 - mimeword, mimeword, target, strerror(errno));
 283         }
 284       }
 285
 286     #endif
 287
 288     /* No charset translation is happening or there was a translation error;
 289     just set up the original as the string to be added, and mark it all used.
 290     */
 291
 292     if (tlen == -1)
 293       {
 294       tptr = dptr;
 295       tlen = dlen;
 296       dlen = 0;
 297       }
 298
 299     /* Deal with zero values; convert them if requested. */
 300
 301     if (zeroval != 0)
 302       {
 303       int i;
 304       for (i = 0; i < tlen; i++)
 305         if (tptr[i] == 0) tptr[i] = zeroval;
 306       }
 307
 308     /* Add the new string onto the result */
 309
 310     yield = string_cat(yield, &size, &ptr, tptr, tlen);
 311     }
 312
 313   #if HAVE_ICONV
 314   if (icd != (iconv_t)(-1))  iconv_close(icd);
 315   #endif
 316
 317   /* Update string past the MIME word; skip any white space if the next thing
 318   is another MIME word. */
 319
 320   string = endword + 2;
 321   mimeword = decode_mimeword(string, lencheck, &q1, &q2, &endword, &dlen, &dptr);
 322   if (mimeword != NULL)
 323     {
 324     uschar *s = string;
 325     while (isspace(*s)) s++;
 326     if (s == mimeword) string = s;
 327     }
 328   }
 329
 330 /* Copy the remaining characters of the string, zero-terminate it, and return
 331 the length as well if requested. */
 332
 333 yield = string_cat(yield, &size, &ptr, string, Ustrlen(string));
 334 yield[ptr] = 0;
 335 if (lenptr != NULL) *lenptr = ptr;
 336 if (sizeptr != NULL) *sizeptr = size;
 337 return yield;
 338 }
 339
 340
 341 /* This is the stub that provides the original interface without the sizeptr
 342 argument. */
 343
 344 uschar *
 345 rfc2047_decode(uschar *string, BOOL lencheck, uschar *target, int zeroval,
 346   int *lenptr, uschar **error)
 347 {
 348 return rfc2047_decode2(string, lencheck, target, zeroval, lenptr, NULL, error);
 349 }
 350
 351 /* End of rfc2047.c */