Jori Hamalainen's patch to speed up exigrep, and fix two typos.

[exim.git] / src / src / parse.c
diff --git a/src/src/parse.c b/src/src/parse.c

index e42e0c8a8e99e3c4388df389ed1b3f5dfe98286b..66764b642216304361b97b93766430910eef1018 100644 (file)
--- a/src/src/parse.c
+++ b/src/src/parse.c
@@ -1,10 +1,10 @@
-/* $Cambridge: exim/src/src/parse.c,v 1.1 2004/10/07 10:39:01 ph10 Exp $ */
+/* $Cambridge: exim/src/src/parse.c,v 1.11 2007/01/08 10:50:18 ph10 Exp $ */
  
  /*************************************************
  *     Exim - an Internet mail transport agent    *
  *************************************************/
  
-/* Copyright (c) University of Cambridge 1995 - 2004 */
+/* Copyright (c) University of Cambridge 1995 - 2007 */
  /* See the file NOTICE for conditions of use and distribution. */
  
  /* Functions for parsing addresses */
@@ -243,18 +243,17 @@ s = skip_comment(s);
  any character except [ ] \, including linear white space, and may contain
  quoted characters. However, RFC 821 restricts literals to being dot-separated
  3-digit numbers, and we make the obvious extension for IPv6. Go for a sequence
-of digits and dots (hex digits and colons for IPv6) here; later this will be
-checked for being a syntactically valid IP address if it ever gets to a router.
+of digits, dots, hex digits, and colons here; later this will be checked for
+being a syntactically valid IP address if it ever gets to a router.
  
-If IPv6 is supported, allow both the formal form, with IPV6: at the start, and
-the informal form without it, and accept IPV4: as well, 'cause someone will use
-it sooner or later. */
+Allow both the formal IPv6 form, with IPV6: at the start, and the informal form
+without it, and accept IPV4: as well, 'cause someone will use it sooner or
+later. */
  
  if (*s == '[')
    {
    *t++ = *s++;
  
-  #if HAVE_IPV6
    if (strncmpic(s, US"IPv6:", 5) == 0 || strncmpic(s, US"IPv4:", 5) == 0)
      {
      memcpy(t, s, 5);
@@ -263,10 +262,6 @@ if (*s == '[')
      }
    while (*s == '.' || *s == ':' || isxdigit(*s)) *t++ = *s++;
  
-  #else
-  while (*s == '.' || isdigit(*s)) *t++ = *s++;
-  #endif
-
    if (*s == ']') *t++ = *s++; else
      {
      *errorptr = US"malformed domain literal";
@@ -602,10 +597,15 @@ which may appear in certain headers. If the flag parse_allow_group is set
  TRUE and parse_found_group is FALSE when this function is called, an address
  which is the start of a group (i.e. preceded by a phrase and a colon) is
  recognized; the phrase is ignored and the flag parse_found_group is set. If
-this flag is TRUE at the end of an address, then if an extraneous semicolon is
-found, it is ignored and the flag is cleared. This logic is used only when
-scanning through addresses in headers, either to fulfil the -t option or for
-rewriting or checking header syntax.
+this flag is TRUE at the end of an address, and if an extraneous semicolon is
+found, it is ignored and the flag is cleared.
+
+This logic is used only when scanning through addresses in headers, either to
+fulfil the -t option, or for rewriting, or for checking header syntax. Because
+the group "state" has to be remembered between multiple calls of this function,
+the variables parse_{allow,found}_group are global. It is important to ensure
+that they are reset to FALSE at the end of scanning a header's list of
+addresses.
  
  Arguments:
    mailbox     points to the RFC822 mailbox
@@ -847,6 +847,11 @@ If the only characters that strictly need quoting are spaces, we return the
  original string, unmodified. If a quoted string is too long for the buffer, it
  is truncated. (This shouldn't happen: this is normally handling short strings.)
  
+Hmmph. As always, things get perverted for other uses. This function was
+originally for the "phrase" part of addresses. Now it is being used for much
+longer texts in ACLs and via the ${rfc2047: expansion item. This means we have
+to check for overlong "encoded-word"s and split them. November 2004.
+
  Arguments:
    string       the string to quote - already checked to contain non-printing
                   chars
@@ -854,6 +859,8 @@ Arguments:
    charset      the name of the character set; NULL => iso-8859-1
    buffer       the buffer to put the answer in
    buffer_size  the size of the buffer
+  fold         if TRUE, a newline is inserted before the separating space when
+                 more than one encoded-word is generated
  
  Returns:       pointer to the original string, if no quoting needed, or
                 pointer to buffer containing the quoted string, or
@@ -863,10 +870,11 @@ Returns:       pointer to the original string, if no quoting needed, or
  
  uschar *
  parse_quote_2047(uschar *string, int len, uschar *charset, uschar *buffer,
-  int buffer_size)
+  int buffer_size, BOOL fold)
  {
  uschar *s = string;
-uschar *t;
+uschar *p, *t;
+int hlen;
  BOOL coded = FALSE;
  
  if (charset == NULL) charset = US"iso-8859-1";
@@ -876,11 +884,26 @@ if (charset == NULL) charset = US"iso-8859-1";
  if (!string_format(buffer, buffer_size, "=?%s?Q?", charset))
    return US"String too long";
  
-t = buffer + Ustrlen(buffer);
+hlen = Ustrlen(buffer);
+t = buffer + hlen;
+p = buffer;
+
  for (; len > 0; len--)
    {
    int ch = *s++;
-  if (t > buffer + buffer_size - 8) break;
+  if (t > buffer + buffer_size - hlen - 8) break;
+
+  if (t - p > 70)
+    {
+    *t++ = '?';
+    *t++ = '=';
+    if (fold) *t++ = '\n';
+    *t++ = ' ';
+    p = t;
+    Ustrncpy(p, buffer, hlen);
+    t += hlen;
+    }
+
    if (ch < 33 || ch > 126 ||
        Ustrchr("?=()<>@,;:\\\".[]_", ch) != NULL)
      {
@@ -893,7 +916,11 @@ for (; len > 0; len--)
      }
    else *t++ = ch;
    }
-sprintf(CS t, "?=");
+
+*t++ = '?';
+*t++ = '=';
+*t = 0;
+
  return coded? buffer : string;
  }
  
@@ -970,7 +997,7 @@ for (i = 0, s = phrase; i < len; i++, s++)
    if ((*s < 32 && *s != '\t') || *s > 126) break;
  
  if (i < len) return parse_quote_2047(phrase, len, headers_charset, buffer,
-  buffer_size);
+  buffer_size, FALSE);
  
  /* No non-printers; use the RFC 822 quoting rules */
  
@@ -1451,7 +1478,7 @@ for (;;)
        {
        *error = string_sprintf("failed to stat included file %s: %s",
          filename, strerror(errno));
-      fclose(f);
+      (void)fclose(f);
        return FF_INCLUDEFAIL;
        }
  
@@ -1478,11 +1505,11 @@ for (;;)
        {
        *error = string_sprintf("error while reading included file %s: %s",
          filename, strerror(errno));
-      fclose(f);
+      (void)fclose(f);
        return FF_ERROR;
        }
      filebuf[statbuf.st_size] = 0;
-    fclose(f);
+    (void)fclose(f);
  
      addr = NULL;
      frc = parse_forward_list(filebuf, options, &addr,
@@ -1637,6 +1664,68 @@ for (;;)
  }
  
  
+
+/*************************************************
+*            Extract a Message-ID                *
+*************************************************/
+
+/* This function is used to extract message ids from In-Reply-To: and
+References: header lines. The id is stored with its '<' and '>' delimiters.
+
+Arguments:
+  str          pointer to the start of the message-id
+  yield        put pointer to the message id (in dynamic memory) here
+  error        put error message here on failure
+
+Returns:       points after the processed message-id or NULL on error
+*/
+
+uschar *
+parse_message_id(uschar *str, uschar **yield, uschar **error)
+{
+uschar *domain = NULL;  /* still NULL after parsing => "domain missing" */
+uschar *id;             /* current write position in the output block */
+/* After skip_comment(), the text must begin with '<'. */
+str = skip_comment(str);
+if (*str != '<')
+  {
+  *error = US"Missing '<' before message-id";
+  return NULL;
+  }
+
+/* Getting a block the size of the input string will definitely be sufficient
+for the answer, but it may also be very long if we are processing a header
+line. Therefore, take care to release unwanted store afterwards. */
+
+id = *yield = store_get(Ustrlen(str) + 1);
+*id++ = *str++;         /* copy the opening '<' */
+/* read_addr_spec() gets the output position and '>' as the terminator. */
+str = read_addr_spec(str, id, '>', error, &domain);
+/* NOTE(review): *error is not cleared here; assumes read_addr_spec() did. */
+if (*error == NULL)
+  {
+  if (*str != '>') *error = US"Missing '>' after message-id";
+    else if (domain == NULL) *error = US"domain missing in message-id";
+  }
+/* On failure the block is released, but *yield is not reset to NULL. */
+if (*error != NULL)
+  {
+  store_reset(*yield);
+  return NULL;
+  }
+/* Step past the text written at id, then append the '>' and a zero. */
+while (*id != 0) id++;
+*id++ = *str++;
+*id++ = 0;
+store_reset(id);        /* release the unused tail of the block */
+/* Return the position after the id, passed through skip_comment(). */
+str = skip_comment(str);
+return str;
+}
+
+
+
+
  /*************************************************
  **************************************************
  *             Stand-alone test program           *
@@ -1769,6 +1858,26 @@ while (Ufgets(buffer, sizeof(buffer), stdin) != NULL)
    else printf("Failed: %d %s\n", extracted, errmess);
    }
  
+printf("Testing parse_message_id\n");
+
+while (Ufgets(buffer, sizeof(buffer), stdin) != NULL)
+  {
+  uschar *s, *t, *errmess;
+  buffer[Ustrlen(buffer) - 1] = 0;
+  if (buffer[0] == 0) break;
+  s = buffer;
+  while (*s != 0)
+    {
+    s = parse_message_id(s, &t, &errmess);
+    if (errmess != NULL)
+      {
+      printf("Failed: %s\n", errmess);
+      break;
+      }
+    printf("%s\n", t);
+    }
+  }
+
  return 0;
  }