doc/doc-scripts/fc2k

   1 #! /usr/bin/perl -w
   2
   3 # Script to read the HTML table of contents for the Exim FAQ and create an
   4 # HTML KWIC index out of it.
   5
   6
   7 ########################################################################
   8 # List of words to ignore - kept alphabetically for reference, but they
   9 # don't have to be in order.
  10
  11 $ignore_list = "
  12
  13 a ability able about absence access according actual address addresses addressed
  14 affect affected after against aka all allow allowed allows along already also
  15 although always am amount an ancient and and/or annoying another any anybody
  16 anyone anything anywhere apparent apparently are aren't around arrange arrive
  17 arrives as at
  18
  19 back bad based basically be because been behave behaviour being best between
  20 bob both box bug build builds built busy but by
  21
  22 call called calls can can't cannot causes causing central certain code comes
  23 coming command commands complain complaining complains configure configured
  24 conjunction contact contain contains contained correct correctly could
  25 currently customer
  26
  27 day days defined deliver delivers delivered delivery deliveries did do does
  28 doesn't doing don't down during
  29
  30 e-mail e-mails each easy either else email emails entirely entries entry
  31 especially etc even ever every example exim exim's experiencing
  32
  33 far few file files find finds fine fix fixed fly following for form found from
  34 fully
  35
  36 generate generated get gets getting given gives giving go goes going got
  37
  38 handle handles handled handling happen happens has have haven't having helpful
  39 him host hosts how however
  40
  41 i i'd i'm i've if in indeed instead into is issue issues isn't it it's its
  42
  43 jim just
  44
  45 keep keeps know knows
  46
  47 like line lines look looked looking lot
  48
  49 m machine machines machine's mail mails main make me mean means message messages
  50 might more much must my myself
  51
  52 near need neither no nor not now
  53
  54 occur of off often ok on one only or other our out over own
  55
  56 part parts particular per place possibility possible present problem problems
  57 put puts
  58
  59 quite
  60
  61 raised rather really reason rid right round run runs
  62
  63 same say saying see seeing seem seems seen sees set setting she should simply
  64 sit so some somehow something sometimes stand state statement still strange such
  65 supposed system systems
  66
  67 take takes tell than that the their them then there these they things think this
  68 those thought to try though to/for told too tried tries trying
  69
  70 under until up use uses used using usually
  71
  72 valid value values via
  73
  74 want wanted wanting was way we we've well what what's when where whereabouts
  75 whenever whether which while who whose why will with within without wish won't
  76 wondered work worked working works would wrong
  77
  78 xxx
  79
  80 yet yyy
  81
  82 ";
  83 ########################################################################
  84
  85
  86 # The regular expression fragment that defines the separator between words
  87
  88 $wordgap = "(?:[]().?,;:\"']|(?><[^>]*>))*(?:\\s+|\$)(?:[[(\"'`]|(?><[^>]*>))*";
  89
  90
  91 ########################################################################
  92 # Function to add to a length to accommodate HTML stuff
  93
  94 sub setlen{
  95 my($len, $s) = @_;
  96
  97 $len += length($1) while ($s =~ /(<\/?[a-z]+>)/ig);
  98 $len += 1 while ($s =~ /&#\d+;/g);
  99
 100 return $len;
 101 }
 102
 103
 104 ########################################################################
 105 # Function to write out the list of initials with references
 106
 107 sub write_initials {
 108 my($this_initial) = "$_[0]";
 109
 110 print OUT "<p>\n&nbsp;&nbsp;";
 111
 112 foreach $initial (sort keys %initials)
 113   {
 114   if ($initial eq $this_initial)
 115     {
 116     print OUT "&nbsp;&nbsp;&nbsp;<font size=7 color=\"#FF0A0A\"><b>$initial</b></font>&nbsp;";
 117     }
 118   else
 119     {
 120     print OUT "<a href=\"FAQ-KWIC_$initial.html\">&nbsp;&nbsp;$initial</a>";
 121     }
 122   }
 123
 124 print OUT "&nbsp;"x4 . "<a href=\"FAQ.html#TOC\">FAQ Contents</a>\n</p>\n";
 125 }
 126
 127
 128
 129 ########################################################################
 130 # The main program. We can pick out the contents lines because they lie
 131 # between <li> and </li> in the file, sometimes on more than one physical
 132 # line.
 133
 134 # Turn the list of ignorable words into a hash for quick lookup. Add the
 135 # empty word to the list.
 136
 137 @words = split /\s+/, $ignore_list;
 138 foreach $word (@words) { $ignore{$word} = 1; }
 139 $ignore{""} = 1;
 140
 141
 142 # Open the file and do the job
 143
 144 open(IN, "html/FAQ.html") || die "Can't open html/FAQ.html\n";
 145
 146 while (<IN>)
 147   {
 148   next unless /^<li>/;
 149   $_ .= <IN> while !/<\/li>$/;
 150   chomp;
 151   s/\n\s*/ /g;
 152
 153   # Extract the operative text into $text, with the beginning in $pre.
 154
 155   my($pre,$text,$post) = /^<li>(.*<\/a>:(?:&nbsp;)*)(.*)<br><br><\/li>$/;
 156
 157   # Now split into words. As well as punctuation, there may be HTML thingies
 158   # between words. Absorb them into the separators.
 159
 160   my(@words) = split /$wordgap/, $text;
 161
 162   # Lower case all the words, and remove those that we don't want.
 163   # Then keep a list of all the used initials.
 164
 165   REMOVE_IGNORE:
 166   for ($i = 0; $i < scalar @words; $i++)
 167     {
 168     my($word) = $words[$i] = "\L$words[$i]\E";
 169
 170     # Remove certain forms of word and those on the ignore list
 171
 172     if (defined $ignore{$word} ||  # word on ignore list
 173         $word =~ /^-+$/        ||  # word consists entirely of hyphens
 174         $word =~ /^-[^a-z]/    ||  # follows leading hyphen with non-letter
 175         $word =~ /^[^a-z-]/    ||  # starts with a non-letter or hyphen
 176         $word =~ /[@^.]/           # contains @ or ^ or .
 177        )
 178       {
 179       splice(@words, $i, 1);
 180       redo REMOVE_IGNORE if $i < scalar @words;
 181       }
 182
 183     # Otherwise, build up a list of initials
 184
 185     else
 186       {
 187       my($inword) = $word;
 188       $inword =~ s/^-//;
 189       $initial = substr($inword, 0, 1);
 190       $initials{"\U$initial\E"} = 1;
 191       }
 192     }
 193
 194   # Create the lines for the KWIC index, and store them in associative
 195   # arrays, with the keyword as the key. That will get them sorted
 196   # automatically.
 197
 198   while (scalar @words > 0)
 199     {
 200     my($word) = shift @words;
 201     my($pretext, $casedword, $posttext) =
 202       $text =~ /(.*?)(?<![a-z])(\Q$word\E)(?![a-z])(.*)/i;
 203
 204     # Remove a leading hyphen from $word so that it sorts according to
 205     # the leading letter. What is actually output is $casedword, which
 206     # retains the hyphen.
 207
 208     $word =~ s/^-//;
 209
 210     my($prelen) = length $pretext;
 211     my($postlen) = length $posttext;
 212
 213     # We want to chop excessively long entries on either side. We can't set
 214     # a fixed length because of the HTML control data. Call a function to
 215     # add the given length to allow for HTML stuff. This is crude, but it
 216     # does roughly the right thing.
 217
 218     my($leftlen) = &setlen(70, $pretext);
 219     my($rightlen) = &setlen(70, $posttext);
 220
 221     if ($prelen > $leftlen)
 222       {
 223       my($cutoff) = $leftlen;
 224       $cutoff++
 225         while ($cutoff < $prelen && substr($pretext, -$cutoff, 1) ne " ");
 226       $pretext = "... " . substr($pretext, -$cutoff);
 227       }
 228
 229     if ($postlen > $rightlen)
 230       {
 231       my($cutoff) = $rightlen;
 232       $cutoff++
 233         while ($cutoff < $postlen && substr($posttext, $cutoff, 1) ne " ");
 234       $posttext = substr($posttext, 0, $cutoff) . "...";
 235       }
 236
 237     # If the pre text has a font-ending not preceded by a font beginning
 238     # (i.e. we've chopped the beginning off), we must insert a beginning.
 239
 240     while ($pretext =~ /^(.*?)<\/(small|tt|b|i)>/ && $1 !~ /<$2>/)
 241       {
 242       $pretext = "<$2>" . $pretext;
 243       }
 244
 245     # If the pre text ends in a special font, we have to terminate that,
 246     # and reset it at the start of the post text.
 247
 248     my($poststart) = "";
 249
 250     while ($pretext =~ /<(small|tt|b|i)>(?!.*?<\/\1>)/)
 251       {
 252       $pretext .= "</$1>";
 253       $poststart .= "<$1>";
 254       }
 255
 256     # If the post text changes font but doesn't close it, we must add
 257     # the closure.
 258
 259     while ($posttext =~ /<(small|tt|b|i)>(?!.*?<\/\1>)/)
 260       {
 261       $posttext .= "</$1>";
 262       }
 263
 264     # Remove any unnecessary changes in either of them
 265
 266     $pretext  =~ s/<(small|tt|b|i)>\s*<\/\1>//g;
 267     $posttext =~ s/<(small|tt|b|i)>\s*<\/\1>//g;
 268
 269     # Save the texts in associative arrays. Add the question number to
 270     # the end of the word to make the key.
 271
 272     $pre =~ /(Q\d\d\d\d)/;
 273     my($key) = "$word-$1";
 274
 275     $tableft{$key}  = $pre . $pretext;
 276     $tabright{$key} = $poststart .
 277       "<font color=\"#FF0A0A\">$casedword</font>" . $posttext;
 278     }
 279   }
 280
 281 close(IN);
 282
 283 # Now write out the files. Each letter in the index goes in a different file
 284
 285 $current_initial = "";
 286
 287 foreach $key (sort keys %tableft)
 288   {
 289   my($initial) = $key =~ /^(.)/;
 290   $initial = "\U$initial\E";
 291
 292   if ($initial ne $current_initial)
 293     {
 294     if ($current_initial ne "")
 295       {
 296       print OUT "</table>\n";
 297       &write_initials($current_initial);
 298       print OUT "</body>\n</html>\n";
 299       close OUT;
 300       }
 301
 302     open (OUT, ">html/FAQ-KWIC_$initial.html") ||
 303       die "Can't open html/FAQ-KWIC_$initial.html\n";
 304     print OUT
 305       "<html>\n" .
 306       "<head>\n" .
 307       "<title>Exim FAQ: KWIC index section $initial</title>\n" .
 308       "</head>\n" .
 309       "<body bgcolor=\"#F8F8F8\" text=\"#00005A\" link=\"#0066FF\" alink=\"#0066FF\" vlink=\"#000099\">\n" .
 310       "<h1>Exim FAQ: Keyword-in-context index</h1>\n";
 311
 312     write_initials($initial);
 313
 314     if ($initial eq "A")
 315       {
 316       print OUT <<End ;
 317 <p>
 318 This <i>Keyword-in-context</i> index for the Exim FAQ is generated
 319 automatically from the FAQ source. Browsers may not display the data very
 320 prettily, but it is hoped that it may provide a useful aid for finding things
 321 in the FAQ.
 322 </p>
 323 End
 324       }
 325
 326     print OUT "<table border>\n";
 327     $current_initial = $initial;
 328     }
 329
 330   print OUT "<tr>\n";
 331   print OUT "<td align=\"right\">$tableft{$key}</td>\n";
 332   print OUT "<td align=\"left\">$tabright{$key}</td>\n";
 333   print OUT "</tr>\n";
 334   }
 335
 336 # Close the final file
 337
 338 if ($current_initial ne "")
 339   {
 340   print OUT "</table>\n";
 341   &write_initials($current_initial);
 342   print OUT "</body>\n</html>\n";
 343   close OUT;
 344   }
 345
 346 # End