doc/doc-docbook/TidyHTML-filter

   1 #! /usr/bin/perl
   2
   3 # $Cambridge: exim/doc/doc-docbook/TidyHTML-filter,v 1.3 2006/02/01 11:01:01 ph10 Exp $
   4
   5 # Script to tidy up the filter HTML file that is generated by xmlto. The
   6 # following changes are made:
   7 #
   8 # 1. Split very long lines.
   9 # 2. Create reverse links from chapter and section titles back to the TOC.
  10 # 3. Turn <div class="literallayout"><p> into <div class="literallayout"> and
  11 #    a matching </p></div> into </div> to get rid of unwanted vertical white
  12 #    space.
  13
  14
  15 $tocref = 1;        # sequence number for the next generated "tocNNNN" id
  16 $thisdiv = 0;       # NOTE(review): not referenced again in the visible script
  17
  18 # Read the whole of filter.html into @text, one element per input line.
  19
  20 open(my $in, "<", "filter.html") || die "Failed to open filter.html for reading: $!\n";
  21 @text = <$in>;
  22 close($in);
  23
  24 # The whole TOC is generated as one humungous line that is hard to check, and
  25 # the start of the first chapter is on that same line. So split the chapter
  26 # off first, then insert a newline after every > in what remains and split
  27 # that into separate items in the vector. The title is matched with m{...}
  28 # because a bare ?...? match is a syntax error from Perl 5.22 onwards.
  29 for ($i = 0; $i < scalar(@text); $i++)
  30   {
  31   if ($text[$i] =~ m{<title>Exim's interfaces to mail filtering</title>})
  32     {
  33     splice @text, $i, 1, (split /(?=<div class="chapter")/, $text[$i]);
  34     $text[$i] =~ s/>\s*/>\n/g;
  35     splice @text, $i, 1, (split /(?<=\n)/, $text[$i]);
  36     last;
  37     }
  38   }
  39
  40 # We want to create reverse links from each chapter and section title back to
  41 # the relevant place in the TOC. Scan the TOC for the relevant entries. Add
  42 # an id to each entry, and create tables that remember the new link ids. We
  43 # detect the start of the TOC by <div class="toc" and the end of the TOC by
  44 # <div class="chapter".
  45
  46 # Skip to start of TOC: leave $i at the first line that begins the TOC div,
  47 # or at scalar(@text) if no line does.
  48
  49 $i = 0;
  50 $i++
  51   until $i >= scalar(@text) || $text[$i] =~ /^<div class="toc"/;
  52
  53 # Scan the TOC, stopping at the first chapter. Every entry that links to
  54 # "#target" is given a fresh id and %backref remembers target => id.
  55 while ($i < scalar(@text) && $text[$i] !~ /^<div class="chapter"/)
  56   {
  57   if ($text[$i] =~ /^<a href="(#[^"]+)">/)
  58     {
  59     my $target = $1;
  60     my $anchor = "toc" . sprintf("%04d", $tocref++);
  61     $text[$i] =~ s/<a/<a id="$anchor"/;
  62     $backref{$target} = $anchor;
  63     }
  64   $i++;
  65   }
  66
  67 # Scan remainder of the document: turn each empty heading anchor into a link
  68 # back to its TOC entry, and strip the <p> wrapper from literallayout divs.
  69 for (; $i < scalar(@text); $i++)
  70   {
  71   while ($text[$i] =~
  72       /^(.*)<a( xmlns="[^"]+")? id="([^"]+)"><\/a>(.*?)<\/h(.*)/)
  73     {
  74     my($ref) = $backref{"#$3"};   # $3 is the id; $2 is only the xmlns attribute
  75     $text[$i] = "$1<a$2 href=\"#$ref\" id=\"$3\">$4</a></h$5";
  76     }                             # the rewritten anchor no longer matches
  77   # Drop the <p> (and an optional <br />) that follows a literallayout opener...
  78   if ($text[$i] =~ /^(.*)<div class="literallayout"><p>(?:<br \/>)?(.*)/)
  79     {
  80     my($j);
  81     $text[$i] = "$1<div class=\"literallayout\">$2";
  82     # ...and the </p> on the first later line that starts with </p></div>.
  83     for ($j = $i + 1; $j < scalar(@text); $j++)
  84       {
  85       if ($text[$j] =~ /^<\/p><\/div>/)
  86         {
  87         $text[$j] =~ s/<\/p>//;
  88         last;
  89         }
  90       }
  91     }
  92   }
  93
  94 # Write out the revised file over the original; fail loudly on any error.
  95
  96 open(my $out, ">", "filter.html") || die "Failed to open filter.html for writing: $!\n";
  97 print $out @text or die "Failed to write filter.html: $!\n";
  98 close($out) || die "Failed to close filter.html after writing: $!\n";
  99
 100 # End