2 # $Cambridge: exim/doc/doc-scripts/fc2k,v 1.2 2004/10/14 09:53:11 ph10 Exp $
4 # Script to read the HTML table of contents for the Exim FAQ and create an
5 # HTML KWIC index out of it.
8 ########################################################################
9 # List of words to ignore - kept alphabetically for reference, but they
10 # don't have to be in order.
14 a ability able about absence access according actual address addresses addressed
15 affect affected after against aka all allow allowed allows along already also
16 although always am amount an ancient and and/or annoying another any anybody
17 anyone anything anywhere apparent apparently are aren't around arrange arrive
20 back bad based basically be because been behave behaviour being best between
21 bob both box bug build builds built busy but by
23 call called calls can can't cannot causes causing central certain code comes
24 coming command commands complain complaining complains configure configured
25 conjunction contact contain contains contained correct correctly could
28 day days defined deliver delivers delivered delivery deliveries did do does
29 doesn't doing don't down during
31 e-mail e-mails each easy either else email emails entirely entries entry
32 especially etc even ever every example exim exim's experiencing
34 far few file files find finds fine fix fixed fly following for form found from
37 generate generated get gets getting given gives giving go goes going got
39 handle handles handled handling happen happens has have haven't having helpful
40 him host hosts how however
42 i i'd i'm i've if in indeed instead into is issue issues isn't it it's its
48 like line lines look looked looking lot
50 m machine machines machine's mail mails main make me mean means message messages
51 might more much must my myself
53 near need neither no nor not now
55 occur of off often ok on one only or other our out over own
57 part parts particular per place possibility possible present problem problems
62 raised rather really reason rid right round run runs
64 same say saying see seeing seem seems seen sees set setting she should simply
65 sit so some somehow something sometimes stand state statement still strange such
66 supposed system systems
68 take takes tell than that the their them then there these they things think this
69 those thought to try though to/for told too tried tries trying
71 under until up use uses used using usually
73 valid value values via
75 want wanted wanting was way we we've well what what's when where whereabouts
76 whenever whether which while who whose why will with within without wish won't
77 wondered work worked working works would wrong
84 ########################################################################
87 # The regular expression fragment that defines the separator between words
# It is built from three parts, in order:
#   1. any run of trailing punctuation ( ] ( ) . ? , ; : " ' ) or HTML tags;
#   2. one or more whitespace characters, or the end of the string;
#   3. any run of opening characters ( [ ( " ' ` ) or HTML tags.
# The HTML-tag alternatives use atomic groups, (?>...), so a tag that has
# been matched is not backtracked into. The fragment is interpolated into the
# split pattern that breaks the title text into words.
89 $wordgap = "(?:[]().?,;:\"']|(?><[^>]*>))*(?:\\s+|\$)(?:[[(\"'`]|(?><[^>]*>))*";
92 ########################################################################
93 # Function to add to a length to accommodate HTML stuff
# The two statements below enlarge $len so that markup in $s does not count
# against the visible width:
#   - every simple tag such as <b> or </tt> adds its full length;
#   - every numeric character reference (&#NNN;) adds one.
# NOTE(review): $len and $s are presumably the two call arguments (callers
# pass a base length and a text string) - confirm against the full sub.
98 $len += length($1) while ($s =~ /(<\/?[a-z]+>)/ig);
99 $len += 1 while ($s =~ /&#\d+;/g);
105 ########################################################################
106 # Function to write out the list of initials with references
# Argument: the initial of the page currently being written.
# Output goes to the OUT filehandle as one <p> paragraph: every key of
# %initials in sorted order, followed by a link back to the FAQ contents.
109 my($this_initial) = "$_[0]";
111 print OUT "<p>\n ";
113 foreach $initial (sort keys %initials)
# The current page's own initial is shown large, bold and coloured instead
# of being a link.
115 if ($initial eq $this_initial)
117 print OUT " <font size=7 color=\"#FF0A0A\"><b>$initial</b></font> ";
# Any other initial links to its own FAQ-KWIC_<initial>.html page.
121 print OUT "<a href=\"FAQ-KWIC_$initial.html\"> $initial</a>";
125 print OUT " "x4 . "<a href=\"FAQ.html#TOC\">FAQ Contents</a>\n</p>\n";
130 ########################################################################
131 # The main program. We can pick out the contents lines because they lie
132 # between <li> and </li> in the file, sometimes on more than one physical
# line.
135 # Turn the list of ignorable words into a hash for quick lookup. Add the
136 # empty word to the list.
# The list is split on runs of whitespace, and each resulting word becomes a
# key of %ignore with the value 1.
138 @words = split /\s+/, $ignore_list;
139 foreach $word (@words) { $ignore{$word} = 1; }
143 # Open the file and do the job
# The input path is fixed and relative to the current directory.
145 open(IN, "html/FAQ.html") || die "Can't open html/FAQ.html\n";
# An entry may span several physical lines, so keep appending input to $_
# until it ends with </li>.
# NOTE(review): if the input runs out before a closing </li> is seen, <IN>
# returns undef, $_ stops changing and this loop never terminates - confirm
# that the input is always well-formed.
150 $_ .= <IN> while !/<\/li>$/;
154 # Extract the operative text into $text, with the beginning in $pre.
# $pre is everything from after <li> up to and including the "</a>:" that
# ends the leading link (plus any following padding); $text is the rest, up
# to the closing <br><br></li>. The pattern has only two capture groups, so
# $post is always left undefined.
156 my($pre,$text,$post) = /^<li>(.*<\/a>:(?: )*)(.*)<br><br><\/li>$/;
158 # Now split into words. As well as punctuation, there may be HTML thingies
159 # between words. Absorb them into the separators.
161 my(@words) = split /$wordgap/, $text;
163 # Lower case all the words, and remove those that we don't want.
164 # Then keep a list of all the used initials.
167 for ($i = 0; $i < scalar @words; $i++)
# The lower-cased form is stored back into @words as well as into $word.
169 my($word) = $words[$i] = "\L$words[$i]\E";
171 # Remove certain forms of word and those on the ignore list
173 if (defined $ignore{$word} || # word on ignore list
174 $word =~ /^-+$/ || # word consists entirely of hyphens
175 $word =~ /^-[^a-z]/ || # follows leading hyphen with non-letter
176 $word =~ /^[^a-z-]/ || # starts with neither a letter nor a hyphen
177 $word =~ /[@^.]/ # contains @ or ^ or .
# After the splice, index $i holds the next word, so the iteration is redone
# without incrementing $i - unless the removed word was the last one.
# NOTE(review): the REMOVE_IGNORE label is presumably on the enclosing for
# loop; it is not visible here.
180 splice(@words, $i, 1);
181 redo REMOVE_IGNORE if $i < scalar @words;
184 # Otherwise, build up a list of initials
# The initial is recorded in upper case as a key of %initials.
# NOTE(review): $inword is set in lines not shown here - presumably $word
# with any leading hyphen removed; confirm.
190 $initial = substr($inword, 0, 1);
191 $initials{"\U$initial\E"} = 1;
195 # Create the lines for the KWIC index, and store them in associative
196 # arrays, with the keyword as the key. That will get them sorted
# when the keys are later iterated in sorted order for output.
199 while (scalar @words > 0)
201 my($word) = shift @words;
# Find the keyword in the original (mixed-case) text, case-insensitively and
# not adjacent to another letter, and split the text around it. The leading
# (.*?) is non-greedy, so this is always the FIRST such occurrence: a word
# that appears more than once in a title gets the same context each time.
202 my($pretext, $casedword, $posttext) =
203 $text =~ /(.*?)(?<![a-z])(\Q$word\E)(?![a-z])(.*)/i;
205 # Remove a leading hyphen from $word so that it sorts according to
206 # the leading letter. What is actually output is $casedword, which
207 # retains the hyphen.
211 my($prelen) = length $pretext;
212 my($postlen) = length $posttext;
214 # We want to chop excessively long entries on either side. We can't set
215 # a fixed length because of the HTML control data. Call a function to
216 # add the given length to allow for HTML stuff. This is crude, but it
217 # does roughly the right thing.
219 my($leftlen) = &setlen(70, $pretext);
220 my($rightlen) = &setlen(70, $posttext);
# Left side too long: starting from the allowance, look for a cut point that
# falls on a space (or give up at the full length), keep only the tail from
# there and mark the cut with a leading "... ".
# NOTE(review): the statement that advances $cutoff is not visible here.
222 if ($prelen > $leftlen)
224 my($cutoff) = $leftlen;
226 while ($cutoff < $prelen && substr($pretext, -$cutoff, 1) ne " ");
227 $pretext = "... " . substr($pretext, -$cutoff);
# Right side too long: the same idea, keeping the head of the text and
# marking the cut with a trailing "...".
230 if ($postlen > $rightlen)
232 my($cutoff) = $rightlen;
234 while ($cutoff < $postlen && substr($posttext, $cutoff, 1) ne " ");
235 $posttext = substr($posttext, 0, $cutoff) . "...";
238 # If the pre text has a font-ending not preceded by a font beginning
239 # (i.e. we've chopped the beginning off), we must insert a beginning.
241 while ($pretext =~ /^(.*?)<\/(small|tt|b|i)>/ && $1 !~ /<$2>/)
243 $pretext = "<$2>" . $pretext;
246 # If the pre text ends in a special font, we have to terminate that,
247 # and reset it at the start of the post text.
# $poststart collects the opening tags that must be re-emitted before the
# keyword on the right-hand side.
251 while ($pretext =~ /<(small|tt|b|i)>(?!.*?<\/\1>)/)
254 $poststart .= "<$1>";
257 # If the post text changes font but doesn't close it, we must add
# the matching closing tag to the end of the post text.
260 while ($posttext =~ /<(small|tt|b|i)>(?!.*?<\/\1>)/)
262 $posttext .= "</$1>";
265 # Remove any unnecessary changes in either of them
# That is, an opening tag followed (after optional whitespace) directly by
# its own closing tag.
267 $pretext =~ s/<(small|tt|b|i)>\s*<\/\1>//g;
268 $posttext =~ s/<(small|tt|b|i)>\s*<\/\1>//g;
270 # Save the texts in associative arrays. Add the question number to
271 # the end of the word to make the key.
# NOTE(review): the match result is not checked; if $pre contains no
# Qnnnn number, $1 keeps its value from an earlier successful match.
# Entries with the same word and question number share a key, so a later
# one replaces an earlier one.
273 $pre =~ /(Q\d\d\d\d)/;
274 my($key) = "$word-$1";
# The left column is the entry's leading link text plus the context before
# the keyword; the right column is the highlighted keyword (in its original
# case) plus the context after it.
276 $tableft{$key} = $pre . $pretext;
277 $tabright{$key} = $poststart .
278 "<font color=\"#FF0A0A\">$casedword</font>" . $posttext;
284 # Now write out the files. Each letter in the index goes in a different file
# Keys are walked in sorted order, so all entries for one initial are
# contiguous; a change of initial means the previous file is finished and a
# new one must be started.
286 $current_initial = "";
288 foreach $key (sort keys %tableft)
# The initial is the first character of the key, upper-cased.
290 my($initial) = $key =~ /^(.)/;
291 $initial = "\U$initial\E";
293 if ($initial ne $current_initial)
# Finish off the previous file, if there was one: close the table, repeat
# the list of initials, and end the page.
295 if ($current_initial ne "")
297 print OUT "</table>\n";
298 &write_initials($current_initial);
299 print OUT "</body>\n</html>\n";
# Start the file for the new initial.
303 open (OUT, ">html/FAQ-KWIC_$initial.html") ||
304 die "Can't open html/FAQ-KWIC_$initial.html\n";
308 "<title>Exim FAQ: KWIC index section $initial</title>\n" .
310 "<body bgcolor=\"#F8F8F8\" text=\"#00005A\" link=\"#0066FF\" alink=\"#0066FF\" vlink=\"#000099\">\n" .
311 "<h1>Exim FAQ: Keyword-in-context index</h1>\n";
313 write_initials($initial);
319 This <i>Keyword-in-context</i> index for the Exim FAQ is generated
320 automatically from the FAQ source. Browsers may not display the data very
321 prettily, but it is hoped that it may provide a useful aid for finding things
327 print OUT "<table border>\n";
328 $current_initial = $initial;
# One table row per key: the context before the keyword is right-aligned and
# the keyword plus following context is left-aligned.
332 print OUT "<td align=\"right\">$tableft{$key}</td>\n";
333 print OUT "<td align=\"left\">$tabright{$key}</td>\n";
337 # Close the final file
# Nothing is written if no index entries were produced at all
# ($current_initial is still empty). Otherwise the last file gets the same
# footer as every earlier one: end of table, list of initials, end of page.
339 if ($current_initial ne "")
341 print OUT "</table>\n";
342 &write_initials($current_initial);
343 print OUT "</body>\n</html>\n";