doc/doc-docbook/Pre-xml

   1 #! /usr/bin/perl
   2
   3 # $Cambridge: exim/doc/doc-docbook/Pre-xml,v 1.4 2007/04/11 15:26:09 ph10 Exp $
   4
   5 # Script to pre-process XML input before processing it for various purposes.
   6 # Options specify which transformations are to be done. Monospaced literal
   7 # layout blocks are never touched.
   8
   9 # Changes:
  10
  11 # -ascii:    Replace &#x2019; by '
  12 #            Replace &copy;   by (c)
  13 #            Replace &dagger; by *
  14 #            Replace &Dagger; by **
  15 #            Replace &nbsp;   by a space
  16 #            Replace &ndash;  by -
  17 #            Put quotes round <quote> text
  18 #
  19 # -quoteliteral:
  20 #            Put quotes round <literal> text
  21 #
  22 # -bookinfo: Remove the <bookinfo> element from the file
  23 #
  24 # -fi:       Replace "fi" by &#xFB01; except when it is in an XML element, or
  25 #            inside a <literal>.
  26 #
  27 # -html:     Certain things are done only for HTML output:
  28 #
  29 #            If <literallayout> is followed by optional space and then a
  30 #            newline, the space and newline are removed, because otherwise you
  31 #            get a blank line in the HTML output.
  32 #
  33 # -noindex   Remove the XML that generates indexes.
  34 # -oneindex  Ditto, but add XML to generate a single index.
  35 #
  36 # -optbreak  Insert an optional line break (zero-width space, &#x200B;) after
  37 #            every underscore in text within <option> and <variable> elements,
  38 #            except when preceded by <entry> (i.e. not in tables). The same is
  39 #            also done within a word of four or more upper-case letters (for
  40 #            compile-time options).
  41
  42
  43
  44 # The function that processes non-literal, non-monospaced text
  45
  46 sub process()
  47 {
  48 my($s) = $_[0];
  49
  50 $s =~ s/fi(?![^<>]*>)/&#xFB01;/g if $ligatures;
  51
  52 if ($optbreak)
  53   {
  54   $s =~ s%(?<!<entry>)(<option>|<varname>)([^<]+)%
  55     my($x,$y) = ($1,$2); $y =~ s/_/_&#x200B;/g; "$x"."$y"%gex;
  56
  57   $s =~ s?\b([A-Z_]{4,})\b?
  58     my($x) = $1; $x =~ s/_/_&#x200B;/g; "$x"?gex;
  59   }
  60
  61 if ($ascii)
  62   {
  63   $s =~ s/&#x2019;/'/g;
  64   $s =~ s/&copy;/(c)/g;
  65   $s =~ s/&dagger;/*/g;
  66   $s =~ s/&Dagger;/**/g;
  67   $s =~ s/&nsbp;/ /g;
  68   $s =~ s/&ndash;/-/g;
  69   $s =~ s/<quote>/"/g;
  70   $s =~ s/<\/quote>/"/g;
  71   }
  72
  73 $s;
  74 }
  75
  76
  77 # The main program
  78
  79 $ascii     = 0;
  80 $bookinfo  = 0;
  81 $html      = 0;
  82 $inliteral = 0;
  83 $inliterallayout = 0;
  84 $ligatures = 0;
  85 $madeindex = 0;
  86 $noindex   = 0;
  87 $oneindex  = 0;
  88 $optbreak  = 0;
  89 $quoteliteral = 0;
  90
  91 foreach $arg (@ARGV)
  92   {
  93   if    ($arg eq "-fi")       { $ligatures = 1; }
  94   elsif ($arg eq "-ascii")    { $ascii = 1; }
  95   elsif ($arg eq "-bookinfo") { $bookinfo = 1; }
  96   elsif ($arg eq "-html")     { $html = 1; }
  97   elsif ($arg eq "-noindex")  { $noindex = 1; }
  98   elsif ($arg eq "-oneindex") { $oneindex = 1; }
  99   elsif ($arg eq "-optbreak") { $optbreak = 1; }
 100   elsif ($arg eq "-quoteliteral") { $quoteliteral = 1; }
 101   else  { die "** Pre-xml: Unknown option \"$arg\"\n"; }
 102   }
 103
 104 while (<STDIN>)
 105   {
 106   # Remove <bookinfo> if required
 107
 108   if ($bookinfo && /^<bookinfo/)
 109     {
 110     while (<STDIN>) { last if /^<\/bookinfo/; }
 111     next;
 112     }
 113
 114   # Copy monospaced literallayout blocks
 115
 116   if (/^<literallayout class="monospaced">/)
 117     {
 118     $_ = substr($_, 0, -1) if $html;
 119     print;
 120     while (<STDIN>)
 121       {
 122       print;
 123       last if /^<\/literallayout>/;
 124       }
 125     next;
 126     }
 127
 128   # Adjust index-generation code if required
 129
 130   if (($noindex || $oneindex) && /^<index[\s>]/)
 131     {
 132     while (<STDIN>)
 133       {
 134       last if /^<\/index>/;
 135       }
 136
 137     if ($oneindex && !$madeindex)
 138       {
 139       $madeindex = 1;
 140       print "<index><title>Index</title></index>\n";
 141       }
 142
 143     next;
 144     }
 145
 146   # A line that is not in a monospaced literal block; keep track of which
 147   # parts are in <literal> and which not. The latter get processed by the
 148   # function above. Items in <literal> get quoted unless they are also in
 149   # a <literallayout> block, or are already being quoted.
 150
 151   for (;;)
 152     {
 153     $_ = substr($_, 0, -1) if $html && /^<literallayout[^>]*>\s*\n$/;
 154     $inliterallayout = 1 if /^<literallayout/;
 155     $inliterallayout = 0 if /^<\/literallayout/;
 156
 157     if ($inliteral)
 158       {
 159       if (/^(.*?)<\/literal>(?!<\/quote>)(.*)$/)
 160         {
 161         print $1;
 162         print "\"" if $quoteliteral && !$inliterallayout;
 163         print "</literal>";
 164         $inliteral = 0;
 165         $_ = "$2\n";
 166         }
 167       else
 168         {
 169         print;
 170         last;
 171         }
 172       }
 173
 174     # Not in literal state
 175
 176     else
 177       {
 178       if (/^(.*?)(?<!<quote>)<literal>(.*)$/)
 179         {
 180         print &process($1);
 181         print "<literal>";
 182         print "\"" if $quoteliteral && !$inliterallayout;
 183         $inliteral = 1;
 184         $_ = "$2\n";
 185         }
 186       else
 187         {
 188         print &process($_);
 189         last;
 190         }
 191       }
 192     }    # Loop for different parts of one line
 193   }      # Loop for multiple lines
 194
 195 # End