Generate ePub files of documentation.

[exim.git] / doc / doc-docbook / Pre-xml
diff --git a/doc/doc-docbook/Pre-xml b/doc/doc-docbook/Pre-xml

index 113e6f9d0bf30b3522526c57e70fa274d662c943..22b343d8db8b234b95ed28bf35b952f478ff6916 100755 (executable)
--- a/doc/doc-docbook/Pre-xml
+++ b/doc/doc-docbook/Pre-xml
@@ -1,38 +1,47 @@
  #! /usr/bin/perl
  
-# $Cambridge: exim/doc/doc-docbook/Pre-xml,v 1.1 2005/06/16 10:32:31 ph10 Exp $
-
  # Script to pre-process XML input before processing it for various purposes.
  # Options specify which transformations are to be done. Monospaced literal
  # layout blocks are never touched.
  
  # Changes:
  
-# -abstract: Remove the <abstract> element
-
-# -ascii:    Replace &8230;   (sic, no x) with ...
-#            Replace &#x2019; by '
-#            Replace &#x201C; by "
-#            Replace &#x201D; by "
-#            Replace &#x2013; by -
-#            Replace &#x2020; by *
-#            Replace &#x2021; by **
-#            Replace &#x00a0; by a space
-#            Replace &#169;   by (c)
-#            Put quotes round <literal> text
+# -ascii:    Replace &#x2019; by '
+#            Replace &copy;   by (c)
+#            Replace &dagger; by *
+#            Replace &Dagger; by **
+#            Replace &nbsp;   by a space
+#            Replace &ndash;  by -
  #            Put quotes round <quote> text
-
+#
+# -quoteliteral:
+#            Put quotes round <literal> text
+#
  # -bookinfo: Remove the <bookinfo> element from the file
-
+#
  # -fi:       Replace "fi" by &#xFB01; except when it is in an XML element, or
  #            inside a <literal>.
-
-# -noindex   Remove the XML to generate a Concept and an Options index.
+#
+# -html:     Certain things are done only for HTML output:
+#
+#            If <literallayout> is followed by optional space and then a
+#            newline, the newline is removed, because otherwise you get a
+#            blank line in the HTML output.
+#
+# -noindex   Remove the XML that generates indexes.
  # -oneindex  Ditto, but add XML to generate a single index.
+#
+# -optbreak  Insert an optional line break (zero-width space, &#x200B;) after
+#            every underscore in text within <option> and <varname> elements,
+#            except when preceded by <entry> (i.e. not in tables). The same is
+#            also done within any word of four or more upper-case letters and
+#            underscores (for compile-time options).
+#
+# -epub      Convert date formats to comply with epub specification
+#
  
  
-
-# The function that processes non-literal monospaced text
+# The function that processes non-literal, non-monospaced text
  
  sub process()
  {
@@ -40,17 +49,23 @@ my($s) = $_[0];
  
  $s =~ s/fi(?![^<>]*>)/&#xFB01;/g if $ligatures;
  
+if ($optbreak)
+  {
+  $s =~ s%(?<!<entry>)(<option>|<varname>)([^<]+)%
+    my($x,$y) = ($1,$2); $y =~ s/_/_&#x200B;/g; "$x"."$y"%gex;
+
+  $s =~ s?\b([A-Z_]{4,})\b?
+    my($x) = $1; $x =~ s/_/_&#x200B;/g; "$x"?gex;
+  }
+
  if ($ascii)
    {
-  $s =~ s/&#8230;/.../g;
    $s =~ s/&#x2019;/'/g;
-  $s =~ s/&#x201C;/"/g;
-  $s =~ s/&#x201D;/"/g;
-  $s =~ s/&#x2013;/-/g;
-  $s =~ s/&#x2020;/*/g;
-  $s =~ s/&#x2021;/**/g;
-  $s =~ s/&#x00a0;/ /g;
-  $s =~ s/&#x00a9;/(c)/g;
+  $s =~ s/&copy;/(c)/g;
+  $s =~ s/&dagger;/*/g;
+  $s =~ s/&Dagger;/**/g;
+  $s =~ s/&nsbp;/ /g;
+  $s =~ s/&ndash;/-/g;
    $s =~ s/<quote>/"/g;
    $s =~ s/<\/quote>/"/g;
    }
@@ -58,35 +73,44 @@ if ($ascii)
  $s;
  }
  
+# Mapping needed for epub
+
+my %months = (
+  jan => '01', feb => '02', mar => '03', apr => '04', may => '05', jun => '06',
+  jul => '07', aug => '08', sep => '09', oct => '10', nov => '11', dec => '12',
+);
  
  # The main program
  
-$abstract  = 0;
  $ascii     = 0;
  $bookinfo  = 0;
+$epub      = 0;
+$html      = 0;
  $inliteral = 0;
+$inliterallayout = 0;
  $ligatures = 0;
  $madeindex = 0;
  $noindex   = 0;
  $oneindex  = 0;
+$optbreak  = 0;
+$quoteliteral = 0;
  
  foreach $arg (@ARGV)
    {
    if    ($arg eq "-fi")       { $ligatures = 1; }
-  elsif ($arg eq "-abstract") { $abstract = 1; }
    elsif ($arg eq "-ascii")    { $ascii = 1; }
    elsif ($arg eq "-bookinfo") { $bookinfo = 1; }
+  elsif ($arg eq "-epub")     { $epub = 1; }
+  elsif ($arg eq "-html")     { $html = 1; }
    elsif ($arg eq "-noindex")  { $noindex = 1; }
    elsif ($arg eq "-oneindex") { $oneindex = 1; }
+  elsif ($arg eq "-optbreak") { $optbreak = 1; }
+  elsif ($arg eq "-quoteliteral") { $quoteliteral = 1; }
    else  { die "** Pre-xml: Unknown option \"$arg\"\n"; }
    }
  
  while (<STDIN>)
    {
-  # Remove <abstract> if required
-
-  next if ($abstract && /^\s*<abstract>/);
-
    # Remove <bookinfo> if required
  
    if ($bookinfo && /^<bookinfo/)
@@ -99,6 +123,7 @@ while (<STDIN>)
  
    if (/^<literallayout class="monospaced">/)
      {
+    $_ = substr($_, 0, -1) if $html;
      print;
      while (<STDIN>)
        {
@@ -126,18 +151,42 @@ while (<STDIN>)
      next;
      }
  
+  # Adjust dates to YYYY-MM-DD
+
+  if ($epub && /^\s*<date[\s>]/)
+    {
+    print;                       # the opening <date> line is passed through
+    while (<STDIN>)
+      {
+      last if /^\s*<\/date>/;    # stop at the closing tag; it stays in $_
+      # Rewrite "D Mon YYYY" as "YYYY-MM-DD", zero-padding the day
+      if (/^ \s* (\d{1,2}) \s+ ([a-zA-Z]{3}) \s+ (\d{4})/x)
+        {
+        die "Unknown month '$2'\n" unless exists $months{lc $2};
+        my $month = $months{lc $2};
+        s//sprintf("%s-%s-%02d", $3, $month, $1)/e;  # empty pattern reuses the match above
+        }
+      print;
+      }
+    }
+
    # A line that is not in a monospaced literal block; keep track of which
    # parts are in <literal> and which not. The latter get processed by the
-  # function above.
+  # function above. Items in <literal> get quoted unless they are also in
+  # a <literallayout> block, or are already being quoted.
  
    for (;;)
      {
+    $_ = substr($_, 0, -1) if $html && /^<literallayout[^>]*>\s*\n$/;
+    $inliterallayout = 1 if /^<literallayout/;
+    $inliterallayout = 0 if /^<\/literallayout/;
+
      if ($inliteral)
        {
-      if (/^(.*?)<\/literal>(.*)$/)
+      if (/^(.*?)<\/literal>(?!<\/quote>)(.*)$/)
          {
          print $1;
-        print "\"" if $ascii;
+        print "\"" if $quoteliteral && !$inliterallayout;
          print "</literal>";
          $inliteral = 0;
          $_ = "$2\n";
@@ -153,11 +202,11 @@ while (<STDIN>)
  
      else
        {
-      if (/^(.*?)<literal>(.*)$/)
+      if (/^(.*?)(?<!<quote>)<literal>(.*)$/)
          {
          print &process($1);
          print "<literal>";
-        print "\"" if $ascii;
+        print "\"" if $quoteliteral && !$inliterallayout;
          $inliteral = 1;
          $_ = "$2\n";
          }