3 # Script to read the HTML table of contents for the Exim FAQ and create an
4 # HTML KWIC index out of it.
7 ########################################################################
8 # List of words to ignore - kept alphabetically for reference, but they
9 # don't have to be in order.
13 a ability able about absence access according actual address addresses addressed
14 affect affected after against aka all allow allowed allows along already also
15 although always am amount an ancient and and/or annoying another any anybody
16 anyone anything anywhere apparent apparently are aren't around arrange arrive
19 back bad based basically be because been behave behaviour being best between
20 bob both box bug build builds built busy but by
22 call called calls can can't cannot causes causing central certain code comes
23 coming command commands complain complaining complains configure configured
24 conjunction contact contain contains contained correct correctly could
27 day days defined deliver delivers delivered delivery deliveries did do does
28 doesn't doing don't down during
30 e-mail e-mails each easy either else email emails entirely entries entry
31 especially etc even ever every example exim exim's experiencing
33 far few file files find finds fine fix fixed fly following for form found from
36 generate generated get gets getting given gives giving go goes going got
38 handle handles handled handling happen happens has have haven't having helpful
39 him host hosts how however
41 i i'd i'm i've if in indeed instead into is issue issues isn't it it's its
47 like line lines look looked looking lot
49 m machine machines machine's mail mails main make me mean means message messages
50 might more much must my myself
52 near need neither no nor not now
54 occur of off often ok on one only or other our out over own
56 part parts particular per place possibility possible present problem problems
61 raised rather really reason rid right round run runs
63 same say saying see seeing seem seems seen sees set setting she should simply
64 sit so some somehow something sometimes stand state statement still strange such
65 supposed system systems
67 take takes tell than that the their them then there these they things think this
68 those thought to try though to/for told too tried tries trying
70 under until up use uses used using usually
72 valid value values via
74 want wanted wanting was way we we've well what what's when where whereabouts
75 whenever whether which while who whose why will with within without wish won't
76 wondered work worked working works would wrong
83 ########################################################################
86 # The regular expression fragment that defines the separator between words.
# It is interpolated into the split that breaks the text of each entry into
# words. Reading from left to right, a separator consists of:
#   - any number of trailing punctuation characters (closing square bracket,
#     either parenthesis, full stop, question mark, comma, semicolon, colon,
#     double or single quote) or complete HTML tags;
#   - then a run of white space, or the end of the string;
#   - then any number of leading punctuation characters (opening square
#     bracket, opening parenthesis, double quote, single quote, backtick) or
#     complete HTML tags.
# Tags are matched inside atomic groups, so the regex engine does not
# backtrack into a tag once it has matched one. The value is a double-quoted
# string, which is why the backslash before s is doubled and the dollar and
# the double quotes are escaped.
88 $wordgap = "(?:[]().?,;:\"']|(?><[^>]*>))*(?:\\s+|\$)(?:[[(\"'`]|(?><[^>]*>))*";
91 ########################################################################
92 # Function to add to a length to accommodate HTML stuff.
# The statements below increase $len according to the markup found in $s:
#   - by the full length of every simple tag, that is an opening angle
#     bracket, an optional slash, one or more letters (matched without regard
#     to case) and a closing angle bracket;
#   - by 1 for every numeric character reference (ampersand, hash, digits,
#     semicolon).
# Tags that carry attributes do not match the first pattern, so they are not
# counted.
# The callers pass a base length and a string, as in setlen(70, $pretext);
# presumably those arrive here as $len and $s - confirm.
97 $len += length($1) while ($s =~ /(<\/?[a-z]+>)/ig);
98 $len += 1 while ($s =~ /&#\d+;/g);
104 ########################################################################
105 # Function to write out the list of initials with references.
# The single argument is the initial of the page being written. The visible
# statements only print to the OUT filehandle: one <p> element that walks the
# keys of %initials in sorted order and ends with a link back to the FAQ
# table of contents.
108 my($this_initial) = "$_[0]";
110 print OUT "<p>\n ";
112 foreach $initial (sort keys %initials)
# The initial that equals the argument is shown in large bold red type.
114 if ($initial eq $this_initial)
116 print OUT " <font size=7 color=\"#FF0A0A\"><b>$initial</b></font> ";
# NOTE(review): presumably this is the else branch, so that every other
# initial becomes a link to its own FAQ-KWIC_<initial>.html page - confirm.
120 print OUT "<a href=\"FAQ-KWIC_$initial.html\"> $initial</a>";
# The one-character string repeated four times (the x operator binds more
# tightly than the dot), followed by the link to the contents in FAQ.html.
124 print OUT " "x4 . "<a href=\"FAQ.html#TOC\">FAQ Contents</a>\n</p>\n";
129 ########################################################################
130 # The main program. We can pick out the contents lines because they lie
131 # between <li> and </li> in the file, sometimes on more than one physical
134 # Turn the list of ignorable words into a hash for quick lookup. Add the
135 # empty word to the list.
# $ignore_list is split on runs of white space and each resulting word
# becomes a key of %ignore with the value 1; the word-filtering loop later
# tests these keys with defined.
137 @words = split /\s+/, $ignore_list;
138 foreach $word (@words) { $ignore{$word} = 1; }
142 # Open the file and do the job
144 open(IN, "html/FAQ.html") || die "Can't open html/FAQ.html\n";
# An entry may span several physical lines: keep appending input to $_ until
# it ends with </li>.
149 $_ .= <IN> while !/<\/li>$/;
153 # Extract the operative text into $text, with the beginning in $pre.
# $pre receives everything after <li> up to and including the colon that
# follows </a>, plus whatever the trailing non-capturing group matches; $text
# receives the rest, up to the two <br> tags before </li>.
# The pattern has only two capturing groups, so $post is left undefined.
155 my($pre,$text,$post) = /^<li>(.*<\/a>:(?: )*)(.*)<br><br><\/li>$/;
157 # Now split into words. As well as punctuation, there may be HTML thingies
158 # between words. Absorb them into the separators.
160 my(@words) = split /$wordgap/, $text;
162 # Lower case all the words, and remove those that we don't want.
163 # Then keep a list of all the used initials.
166 for ($i = 0; $i < scalar @words; $i++)
# Lower-case the word in place in @words and keep a copy in $word.
168 my($word) = $words[$i] = "\L$words[$i]\E";
170 # Remove certain forms of word and those on the ignore list
172 if (defined $ignore{$word} || # word on ignore list
173 $word =~ /^-+$/ || # word consists entirely of hyphens
174 $word =~ /^-[^a-z]/ || # follows leading hyphen with non-letter
175 $word =~ /^[^a-z-]/ || # starts with a non-letter or hyphen
176 $word =~ /[@^.]/ # contains @ or ^ or .
# Deleting element $i moves the following word into the same slot, so the
# body is rerun for that index rather than letting the loop step past it.
# REMOVE_IGNORE is presumably the label on the enclosing for loop - confirm.
179 splice(@words, $i, 1);
180 redo REMOVE_IGNORE if $i < scalar @words;
183 # Otherwise, build up a list of initials
# The upper-cased first character of $inword becomes a key of %initials.
189 $initial = substr($inword, 0, 1);
190 $initials{"\U$initial\E"} = 1;
194 # Create the lines for the KWIC index, and store them in associative
195 # arrays, with the keyword as the key. That will get them sorted
198 while (scalar @words > 0)
200 my($word) = shift @words;
# Find the word in the original text, ignoring case and requiring that it is
# neither directly preceded nor directly followed by a letter. $pretext and
# $posttext are the text captured before and after it; $casedword is the
# word as it was actually written. The search always starts from the
# beginning of $text, so a word that occurs more than once in an entry gives
# the same match, and therefore the same key, each time.
201 my($pretext, $casedword, $posttext) =
202 $text =~ /(.*?)(?<![a-z])(\Q$word\E)(?![a-z])(.*)/i;
204 # Remove a leading hyphen from $word so that it sorts according to
205 # the leading letter. What is actually output is $casedword, which
206 # retains the hyphen.
210 my($prelen) = length $pretext;
211 my($postlen) = length $posttext;
213 # We want to chop excessively long entries on either side. We can't set
214 # a fixed length because of the HTML control data. Call a function to
215 # add the given length to allow for HTML stuff. This is crude, but it
216 # does roughly the right thing.
218 my($leftlen) = &setlen(70, $pretext);
219 my($rightlen) = &setlen(70, $posttext);
# Too much text on the left: keep only the tail of $pretext and mark the cut
# with a leading ellipsis. The loop condition ends the search for a cut point
# at a space, or when $cutoff reaches the full length.
221 if ($prelen > $leftlen)
223 my($cutoff) = $leftlen;
225 while ($cutoff < $prelen && substr($pretext, -$cutoff, 1) ne " ");
226 $pretext = "... " . substr($pretext, -$cutoff);
# The same on the right: keep the head of $posttext and append an ellipsis.
229 if ($postlen > $rightlen)
231 my($cutoff) = $rightlen;
233 while ($cutoff < $postlen && substr($posttext, $cutoff, 1) ne " ");
234 $posttext = substr($posttext, 0, $cutoff) . "...";
237 # If the pre text has a font-ending not preceded by a font beginning
238 # (i.e. we've chopped the beginning off), we must insert a beginning.
240 while ($pretext =~ /^(.*?)<\/(small|tt|b|i)>/ && $1 !~ /<$2>/)
242 $pretext = "<$2>" . $pretext;
245 # If the pre text ends in a special font, we have to terminate that,
246 # and reset it at the start of the post text.
# A start tag counts as unterminated when no matching end tag follows it.
250 while ($pretext =~ /<(small|tt|b|i)>(?!.*?<\/\1>)/)
# Remember the tag so that it can be reopened in front of the keyword.
253 $poststart .= "<$1>";
256 # If the post text changes font but doesn't close it, we must add
259 while ($posttext =~ /<(small|tt|b|i)>(?!.*?<\/\1>)/)
261 $posttext .= "</$1>";
264 # Remove any unnecessary changes in either of them
266 $pretext =~ s/<(small|tt|b|i)>\s*<\/\1>//g;
267 $posttext =~ s/<(small|tt|b|i)>\s*<\/\1>//g;
269 # Save the texts in associative arrays. Add the question number to
270 # the end of the word to make the key.
# NOTE(review): the result of this match is not tested, so if $pre contains
# no question number, $1 keeps whatever an earlier successful match left.
272 $pre =~ /(Q\d\d\d\d)/;
273 my($key) = "$word-$1";
# The left cell is the entry prefix plus the text before the keyword; the
# right cell is any reopened font tags, the keyword in red, and the rest.
275 $tableft{$key} = $pre . $pretext;
276 $tabright{$key} = $poststart .
277 "<font color=\"#FF0A0A\">$casedword</font>" . $posttext;
283 # Now write out the files. Each letter in the index goes in a different file
# The keys are visited in sorted order, so all the entries that share an
# initial are adjacent; a change of initial ends one page and starts the next.
285 $current_initial = "";
287 foreach $key (sort keys %tableft)
# The page is chosen by the upper-cased first character of the key.
289 my($initial) = $key =~ /^(.)/;
290 $initial = "\U$initial\E";
292 if ($initial ne $current_initial)
# Finish the previous page, if there was one: end its table, repeat the
# list of initials, and end the document.
294 if ($current_initial ne "")
296 print OUT "</table>\n";
297 &write_initials($current_initial);
298 print OUT "</body>\n</html>\n";
# Start the page for the new initial.
302 open (OUT, ">html/FAQ-KWIC_$initial.html") ||
303 die "Can't open html/FAQ-KWIC_$initial.html\n";
307 "<title>Exim FAQ: KWIC index section $initial</title>\n" .
309 "<body bgcolor=\"#F8F8F8\" text=\"#00005A\" link=\"#0066FF\" alink=\"#0066FF\" vlink=\"#000099\">\n" .
310 "<h1>Exim FAQ: Keyword-in-context index</h1>\n";
312 write_initials($initial);
318 This <i>Keyword-in-context</i> index for the Exim FAQ is generated
319 automatically from the FAQ source. Browsers may not display the data very
320 prettily, but it is hoped that it may provide a useful aid for finding things
326 print OUT "<table border>\n";
327 $current_initial = $initial;
# Two cells per key: the left context in a right-aligned cell, then the
# keyword and the right context in a left-aligned cell.
331 print OUT "<td align=\"right\">$tableft{$key}</td>\n";
332 print OUT "<td align=\"left\">$tabright{$key}</td>\n";
336 # Close the final file
# Nothing is written here if no page was ever started.
338 if ($current_initial ne "")
340 print OUT "</table>\n";
341 &write_initials($current_initial);
342 print OUT "</body>\n</html>\n";