doc/doc-scripts/fc2k

   1 #! /usr/bin/perl -w
   2 # $Cambridge: exim/doc/doc-scripts/fc2k,v 1.1 2004/10/07 15:04:35 ph10 Exp $
   3
   4 # Script to read the HTML table of contents for the Exim FAQ and create an
   5 # HTML KWIC index out of it.
   6
   7
   8 ########################################################################
   9 # List of words to ignore - kept alphabetically for reference, but they
  10 # don't have to be in order.
  11
  12 $ignore_list = "
  13
  14 a ability able about address addresses addressed affect affected
  15 after against all allow allowed allows already also although always am an and
  16 and/or any anybody anyone anything anywhere are aren't arrange arrive as at
  17
  18 back bad based basically be because been behave behaviour being best between
  19 bob both bug build builds built busy but by
  20
  21 call called calls can can't cannot causes causing central certain code comes
  22 coming command commands complain complaining complains configure configured
  23 conjunction contact contain contains contained correct correctly could
  24 currently customer
  25
  26 day days defined deliver delivers delivered delivery deliveries did do does
  27 doesn't doing don't down during
  28
  29 e-mail e-mails each easy else email emails entirely entries entry especially
  30 etc even ever every example exim exim's experiencing
  31
  32 far few file files find fine fly following for form found from fully
  33
  34 get gets getting given gives giving go goes going got
  35
  36 handle handles handled handling happen happens has have haven't having helpful
  37 him host hosts how however
  38
  39 i i'd i'm i've if in indeed instead into is issue issues isn't it it's its
  40
  41 jim just
  42
  43 keep keeps know knows
  44
  45 like line lines look looked looking lot
  46
  47 machine machines machine's mail mails main make me mean means message messages
  48 might more must my myself
  49
  50 near need neither no nor not now
  51
  52 occur of off often ok on one only or other our out over own
  53
  54 part parts particular per place possibility possible present problem problems
  55 put puts
  56
  57 quite
  58
  59 raised rather really reason rid right round run runs
  60
  61 same say saying see seeing seem seems seen sees set setting she should so some
  62 somehow something sometimes stand state statement still strange such supposed
  63 system systems
  64
  65 take takes than that the their them then there these they things think this
  66 those to try though to/for told too tried tries trying
  67
  68 under until up use uses used using usually
  69
  70 valid value values via
  71
  72 want wanted wanting was way we we've well what what's when where whereabouts
  73 whenever whether which while who whose why will with within without wish won't
  74 wondered work worked working works would wrong
  75
  76 xxx
  77
  78 yet yyy
  79
  80 ";
  81 ########################################################################
  82
  83
  84 # The regular expression fragment that defines the separator between words
  85
  86 $wordgap = "(?:[]().?,;:\"']|(?><[^>]*>))*(?:\\s+|\$)(?:[[(\"'`]|(?><[^>]*>))*";
  87
  88
  89 ########################################################################
  90 # Function to add to a length to accommodate HTML stuff
  91
  92 sub setlen{
  93 my($len, $s) = @_;
  94
  95 $len += length($1) while ($s =~ /(<\/?[a-z]+>)/ig);
  96 $len += 1 while ($s =~ /&#\d+;/g);
  97
  98 return $len;
  99 }
 100
 101
 102 ########################################################################
 103 # Function to write out the list of initials with references
 104
 105 sub write_initials {
 106 my($this_initial) = "$_[0]";
 107
 108 print OUT "<p>\n&nbsp;&nbsp;";
 109
 110 foreach $initial (sort keys %initials)
 111   {
 112   if ($initial eq $this_initial)
 113     {
 114     print OUT "&nbsp;&nbsp;&nbsp;<font size=7 color=\"#FF0A0A\"><b>$initial</b></font>&nbsp;";
 115     }
 116   else
 117     {
 118     print OUT "<a href=\"FAQ-KWIC_$initial.html\">&nbsp;&nbsp;$initial</a>";
 119     }
 120   }
 121
 122 print OUT "&nbsp;"x4 . "<a href=\"FAQ.html#TOC\">FAQ Contents</a>\n</p>\n";
 123 }
 124
 125
 126
 127 ########################################################################
 128 # The main program. We can pick out the contents lines because they lie
 129 # between <li> and </li> in the file, sometimes on more than one physical
 130 # line.
 131
 132 # Turn the list of ignorable words into a hash for quick lookup. Add the
 133 # empty word to the list.
 134
 135 @words = split /\s+/, $ignore_list;
 136 foreach $word (@words) { $ignore{$word} = 1; }
 137 $ignore{""} = 1;
 138
 139
 140 # Open the file and do the job
 141
 142 open(IN, "html/FAQ.html") || die "Can't open html/FAQ.html\n";
 143
 144 while (<IN>)
 145   {
 146   next unless /^<li>/;
 147   $_ .= <IN> while !/<\/li>$/;
 148   chomp;
 149   s/\n\s*/ /g;
 150
 151   # Extract the operative text into $text, with the beginning in $pre.
 152
 153   my($pre,$text,$post) = /^<li>(.*<\/a>:(?:&nbsp;)*)(.*)<br><br><\/li>$/;
 154
 155   # Now split into words. As well as punctuation, there may be HTML thingies
 156   # between words. Absorb them into the separators.
 157
 158   my(@words) = split /$wordgap/, $text;
 159
 160   # Lower case all the words, and remove those that we don't want.
 161   # Then keep a list of all the used initials.
 162
 163   REMOVE_IGNORE:
 164   for ($i = 0; $i < scalar @words; $i++)
 165     {
 166     my($word) = $words[$i] = "\L$words[$i]\E";
 167
 168     # Remove certain forms of word and those on the ignore list
 169
 170     if (defined $ignore{$word} ||  # word on ignore list
 171         $word =~ /^-+$/        ||  # word consists entirely of hyphens
 172         $word =~ /^-[^a-z]/    ||  # follows leading hyphen with non-letter
 173         $word =~ /^[^a-z-]/    ||  # starts with a non-letter or hyphen
 174         $word =~ /[@^.]/           # contains @ or ^ or .
 175        )
 176       {
 177       splice(@words, $i, 1);
 178       redo REMOVE_IGNORE if $i < scalar @words;
 179       }
 180
 181     # Otherwise, build up a list of initials
 182
 183     else
 184       {
 185       my($inword) = $word;
 186       $inword =~ s/^-//;
 187       $initial = substr($inword, 0, 1);
 188       $initials{"\U$initial\E"} = 1;
 189       }
 190     }
 191
 192   # Create the lines for the KWIC index, and store them in associative
 193   # arrays, with the keyword as the key. That will get them sorted
 194   # automatically.
 195
 196   while (scalar @words > 0)
 197     {
 198     my($word) = shift @words;
 199     my($pretext, $casedword, $posttext) =
 200       $text =~ /(.*?)(?<![a-z])(\Q$word\E)(?![a-z])(.*)/i;
 201
 202     # Remove a leading hyphen from $word so that it sorts according to
 203     # the leading letter. What is actually output is $casedword, which
 204     # retains the hyphen.
 205
 206     $word =~ s/^-//;
 207
 208     my($prelen) = length $pretext;
 209     my($postlen) = length $posttext;
 210
 211     # We want to chop excessively long entries on either side. We can't set
 212     # a fixed length because of the HTML control data. Call a function to
 213     # add the given length to allow for HTML stuff. This is crude, but it
 214     # does roughtly the right thing.
 215
 216     my($leftlen) = &setlen(70, $pretext);
 217     my($rightlen) = &setlen(70, $posttext);
 218
 219     if ($prelen > $leftlen)
 220       {
 221       my($cutoff) = $leftlen;
 222       $cutoff++
 223         while ($cutoff < $prelen && substr($pretext, -$cutoff, 1) ne " ");
 224       $pretext = "... " . substr($pretext, -$cutoff);
 225       }
 226
 227     if ($postlen > $rightlen)
 228       {
 229       my($cutoff) = $rightlen;
 230       $cutoff++
 231         while ($cutoff < $postlen && substr($posttext, $cutoff, 1) ne " ");
 232       $posttext = substr($posttext, 0, $cutoff) . "...";
 233       }
 234
 235     # If the pre text has a font-ending not preceded by a font beginning
 236     # (i.e. we've chopped the beginning off), we must insert a beginning.
 237
 238     while ($pretext =~ /^(.*?)<\/(small|tt|b|i)>/ && $1 !~ /<$2>/)
 239       {
 240       $pretext = "<$2>" . $pretext;
 241       }
 242
 243     # If the pre text ends in a special font, we have to terminate that,
 244     # and reset it at the start of the post text.
 245
 246     my($poststart) = "";
 247
 248     while ($pretext =~ /<(small|tt|b|i)>(?!.*?<\/\1>)/)
 249       {
 250       $pretext .= "</$1>";
 251       $poststart .= "<$1>";
 252       }
 253
 254     # If the post text changes font but doesn't close it, we must add
 255     # the closure.
 256
 257     while ($posttext =~ /<(small|tt|b|i)>(?!.*?<\/\1>)/)
 258       {
 259       $posttext .= "</$1>";
 260       }
 261
 262     # Remove any unnecessary changes in either of them
 263
 264     $pretext  =~ s/<(small|tt|b|i)>\s*<\/\1>//g;
 265     $posttext =~ s/<(small|tt|b|i)>\s*<\/\1>//g;
 266
 267     # Save the texts in associative arrays. Add the question number to
 268     # the end of the word to make the key.
 269
 270     $pre =~ /(Q\d\d\d\d)/;
 271     my($key) = "$word-$1";
 272
 273     $tableft{$key}  = $pre . $pretext;
 274     $tabright{$key} = $poststart .
 275       "<font color=\"#FF0A0A\">$casedword</font>" . $posttext;
 276     }
 277   }
 278
 279 close(IN);
 280
 281 # Now write out the files. Each letter in the index goes in a different file
 282
 283 $current_initial = "";
 284
 285 foreach $key (sort keys %tableft)
 286   {
 287   my($initial) = $key =~ /^(.)/;
 288   $initial = "\U$initial\E";
 289
 290   if ($initial ne $current_initial)
 291     {
 292     if ($current_initial ne "")
 293       {
 294       print OUT "</table>\n";
 295       &write_initials($current_initial);
 296       print OUT "</body>\n</html>\n";
 297       close OUT;
 298       }
 299
 300     open (OUT, ">html/FAQ-KWIC_$initial.html") ||
 301       die "Can't open html/FAQ-KWIC_$initial.html\n";
 302     print OUT
 303       "<html>\n" .
 304       "<head>\n" .
 305       "<title>Exim FAQ: KWIC index section $initial</title>\n" .
 306       "</head>\n" .
 307       "<body bgcolor=\"#F8F8F8\" text=\"#00005A\" link=\"#0066FF\" alink=\"#0066FF\" vlink=\"#000099\">\n" .
 308       "<h1>Exim FAQ: Keyword-in-context index</h1>\n";
 309
 310     write_initials($initial);
 311
 312     if ($initial eq "A")
 313       {
 314       print OUT <<End ;
 315 <p>
 316 This <i>Keyword-in-context</i> index for the Exim FAQ is generated
 317 automatically from the FAQ source. Browsers may not display the data very
 318 prettily, but it is hoped that it may provide a useful aid for finding things
 319 in the FAQ.
 320 </p>
 321 End
 322       }
 323
 324     print OUT "<table border>\n";
 325     $current_initial = $initial;
 326     }
 327
 328   print OUT "<tr>\n";
 329   print OUT "<td align=\"right\">$tableft{$key}</td>\n";
 330   print OUT "<td align=\"left\">$tabright{$key}</td>\n";
 331   print OUT "</tr>\n";
 332   }
 333
 334 # Close the final file
 335
 336 if ($current_initial ne "")
 337   {
 338   print OUT "</table>\n";
 339   &write_initials($current_initial);
 340   print OUT "</body>\n</html>\n";
 341   close OUT;
 342   }
 343
 344 # End