2 # $Cambridge: exim/doc/doc-scripts/fc2k,v 1.1 2004/10/07 15:04:35 ph10 Exp $
4 # Script to read the HTML table of contents for the Exim FAQ and create an
5 # HTML KWIC index out of it.
8 ########################################################################
9 # List of words to ignore - kept alphabetically for reference, but they
10 # don't have to be in order.
14 a ability able about address addresses addressed affect affected
15 after against all allow allowed allows already also although always am an and
16 and/or any anybody anyone anything anywhere are aren't arrange arrive as at
18 back bad based basically be because been behave behaviour being best between
19 bob both bug build builds built busy but by
21 call called calls can can't cannot causes causing central certain code comes
22 coming command commands complain complaining complains configure configured
23 conjunction contact contain contains contained correct correctly could
26 day days defined deliver delivers delivered delivery deliveries did do does
27 doesn't doing don't down during
29 e-mail e-mails each easy else email emails entirely entries entry especially
30 etc even ever every example exim exim's experiencing
32 far few file files find fine fly following for form found from fully
34 get gets getting given gives giving go goes going got
36 handle handles handled handling happen happens has have haven't having helpful
37 him host hosts how however
39 i i'd i'm i've if in indeed instead into is issue issues isn't it it's its
45 like line lines look looked looking lot
47 machine machines machine's mail mails main make me mean means message messages
48 might more must my myself
50 near need neither no nor not now
52 occur of off often ok on one only or other our out over own
54 part parts particular per place possibility possible present problem problems
59 raised rather really reason rid right round run runs
61 same say saying see seeing seem seems seen sees set setting she should so some
62 somehow something sometimes stand state statement still strange such supposed
65 take takes than that the their them then there these they things think this
66 those to try though to/for told too tried tries trying
68 under until up use uses used using usually
70 valid value values via
72 want wanted wanting was way we we've well what what's when where whereabouts
73 whenever whether which while who whose why will with within without wish won't
74 wondered work worked working works would wrong
81 ########################################################################
84 # The regular expression fragment that defines the separator between words
# (it is the pattern handed to split when an entry's text is broken into
# words). Reading left to right it matches:
#   - zero or more trailing items, each either one of ] ( ) . ? , ; : " '
#     or a whole HTML tag <...> (matched as an atomic group);
#   - a run of white space, or the end of the string;
#   - zero or more leading items, each either one of [ ( " ' ` or a whole
#     HTML tag <...>.
# The string is double-quoted, hence \\s and \$ to put \s and $ into the
# pattern itself.
86 $wordgap = "(?:[]().?,;:\"']|(?><[^>]*>))*(?:\\s+|\$)(?:[[(\"'`]|(?><[^>]*>))*";
89 ########################################################################
90 # Function to add to a length to accommodate HTML stuff
# NOTE(review): the sub declaration and the statement that sets $len and $s
# are not visible in this listing. The call sites use the name setlen and
# pass a base length followed by a text string, so $len and $s are presumably
# those two arguments - confirm against the full source.
# Every simple HTML tag in $s (an opening or closing tag made up only of
# letters, with no attributes) adds its own full length to $len.
95 $len += length($1) while ($s =~ /(<\/?[a-z]+>)/ig);
# Every numeric character reference of the form &#NNN; in $s adds 1 to $len.
96 $len += 1 while ($s =~ /&#\d+;/g);
102 ########################################################################
103 # Function to write out the list of initials with references
# Prints one paragraph to OUT that walks the keys of %initials in sorted
# order. The initial passed as the first argument is printed as large, bold,
# coloured text; the other print statement emits a link to that letter's
# FAQ-KWIC_<initial>.html file. The paragraph ends with a link back to the
# FAQ table of contents.
# NOTE(review): the sub declaration, the braces and (presumably) the "else"
# between the two print statements are not visible in this listing; the name
# write_initials is taken from the call sites.
106 my($this_initial) = "$_[0]";
108 print OUT "<p>\n ";
110 foreach $initial (sort keys %initials)
112 if ($initial eq $this_initial)
114 print OUT " <font size=7 color=\"#FF0A0A\"><b>$initial</b></font> ";
118 print OUT "<a href=\"FAQ-KWIC_$initial.html\"> $initial</a>";
122 print OUT " "x4 . "<a href=\"FAQ.html#TOC\">FAQ Contents</a>\n</p>\n";
127 ########################################################################
128 # The main program. We can pick out the contents lines because they lie
129 # between <li> and </li> in the file, sometimes on more than one physical line.
132 # Turn the list of ignorable words into a hash for quick lookup. Add the
133 # empty word to the list.
# NOTE(review): $ignore_list is set outside the lines visible here, and no
# visible statement adds the empty word explicitly - confirm where that
# happens in the full source.
135 @words = split /\s+/, $ignore_list;  # one element per white-space-separated word
136 foreach $word (@words) { $ignore{$word} = 1; }  # later tested with "defined $ignore{...}"
140 # Open the file and do the job
142 open(IN, "html/FAQ.html") || die "Can't open html/FAQ.html\n";
# Keep appending physical lines from IN to $_ until it ends with </li>, so
# that an entry spread over several lines is handled as one string.
# NOTE(review): there is no end-of-file check on this statement; if the input
# ran out before a closing </li> was seen, nothing visible here would stop
# the loop - confirm how the surrounding (not visible) loop guards this.
147 $_ .= <IN> while !/<\/li>$/;
151 # Extract the operative text into $text, with the beginning in $pre.
# $pre receives everything from just after <li> up to and including the
# "</a>:" and any spaces that follow it; $text receives the remainder up to
# the closing "<br><br></li>". The pattern has only two capturing groups, so
# $post is never given a value by this match.
153 my($pre,$text,$post) = /^<li>(.*<\/a>:(?: )*)(.*)<br><br><\/li>$/;
155 # Now split into words. As well as punctuation, there may be HTML thingies
156 # between words. Absorb them into the separators.
158 my(@words) = split /$wordgap/, $text;
160 # Lower case all the words, and remove those that we don't want.
161 # Then keep a list of all the used initials.
# An index loop is used because elements are removed while iterating. After
# a splice the following word slides into position $i, and "redo" re-runs the
# loop body for that same $i without incrementing it.
# NOTE(review): the REMOVE_IGNORE label is presumably attached to this for
# loop; the line carrying the label is not visible in this listing.
164 for ($i = 0; $i < scalar @words; $i++)
166 my($word) = $words[$i] = "\L$words[$i]\E";
168 # Remove certain forms of word and those on the ignore list
170 if (defined $ignore{$word} || # word on ignore list
171 $word =~ /^-+$/ || # word consists entirely of hyphens
172 $word =~ /^-[^a-z]/ || # follows leading hyphen with non-letter
173 $word =~ /^[^a-z-]/ || # starts with something other than a letter or a hyphen
174 $word =~ /[@^.]/ # contains @ or ^ or .
177 splice(@words, $i, 1);
178 redo REMOVE_IGNORE if $i < scalar @words;
181 # Otherwise, build up a list of initials
# The first character of $inword, upper-cased, becomes a key of %initials.
# NOTE(review): $inword is assigned on a line not visible here; presumably it
# is $word without any leading hyphen - confirm.
187 $initial = substr($inword, 0, 1);
188 $initials{"\U$initial\E"} = 1;
192 # Create the lines for the KWIC index, and store them in associative
193 # arrays, with the keyword as the key. Sorting the keys later orders them.
# Each pass takes the next surviving word off @words and builds one index
# line for it from the entry's text.
196 while (scalar @words > 0)
198 my($word) = shift @words;
# Split $text around the first case-insensitive occurrence of the word that
# is neither directly preceded nor directly followed by a letter: the text
# before it, the word as actually written in the entry, and the text after.
199 my($pretext, $casedword, $posttext) =
200 $text =~ /(.*?)(?<![a-z])(\Q$word\E)(?![a-z])(.*)/i;
202 # Remove a leading hyphen from $word so that it sorts according to
203 # the leading letter. What is actually output is $casedword, which
204 # retains the hyphen.
# NOTE(review): the statement that strips the hyphen is not visible in this
# listing.
208 my($prelen) = length $pretext;
209 my($postlen) = length $posttext;
211 # We want to chop excessively long entries on either side. We can't set
212 # a fixed length because of the HTML control data. Call a function to
213 # add the given length to allow for HTML stuff. This is crude, but it
214 # does roughly the right thing.
216 my($leftlen) = &setlen(70, $pretext);
217 my($rightlen) = &setlen(70, $posttext);
# Left side too long: keep only the tail of $pretext, starting $cutoff
# characters from its end, and mark the cut with a leading "... ".
# NOTE(review): the while condition below looks like the tail of a do-while
# whose body is not visible here; presumably $cutoff is stepped until the cut
# lands on a space or covers the whole text - confirm.
219 if ($prelen > $leftlen)
221 my($cutoff) = $leftlen;
223 while ($cutoff < $prelen && substr($pretext, -$cutoff, 1) ne " ");
224 $pretext = "... " . substr($pretext, -$cutoff);
# Right side too long: keep only the first $cutoff characters of $posttext
# and mark the cut with a trailing "...".
227 if ($postlen > $rightlen)
229 my($cutoff) = $rightlen;
231 while ($cutoff < $postlen && substr($posttext, $cutoff, 1) ne " ");
232 $posttext = substr($posttext, 0, $cutoff) . "...";
235 # If the pre text has a font-ending not preceded by a font beginning
236 # (i.e. we've chopped the beginning off), we must insert a beginning.
238 while ($pretext =~ /^(.*?)<\/(small|tt|b|i)>/ && $1 !~ /<$2>/)
240 $pretext = "<$2>" . $pretext;
243 # If the pre text ends in a special font, we have to terminate that,
244 # and reset it at the start of the post text.
# The pattern finds an opening tag with no matching closing tag anywhere
# after it; that tag is appended to $poststart, which is later emitted just
# before the highlighted keyword.
# NOTE(review): the statement that terminates the tag in $pretext, and the
# initialisation of $poststart, are not visible in this listing.
248 while ($pretext =~ /<(small|tt|b|i)>(?!.*?<\/\1>)/)
251 $poststart .= "<$1>";
254 # If the post text changes font but doesn't close it, we must add a closing tag.
257 while ($posttext =~ /<(small|tt|b|i)>(?!.*?<\/\1>)/)
259 $posttext .= "</$1>";
262 # Remove any unnecessary changes in either of them
# (an opening tag followed, with at most white space in between, by its own
# closing tag).
264 $pretext =~ s/<(small|tt|b|i)>\s*<\/\1>//g;
265 $posttext =~ s/<(small|tt|b|i)>\s*<\/\1>//g;
267 # Save the texts in associative arrays. Add the question number to
268 # the end of the word to make the key.
# The question number is the first "Q" followed by four digits found in $pre.
# NOTE(review): the result of this match is not checked, so if $pre held no
# such number, $1 would still carry the value left by an earlier successful
# match.
270 $pre =~ /(Q\d\d\d\d)/;
271 my($key) = "$word-$1";
# Left-hand text: the entry prefix followed by the text before the keyword.
# Right-hand text: any re-opened font tags, the keyword in colour, then the
# text that follows it.
273 $tableft{$key} = $pre . $pretext;
274 $tabright{$key} = $poststart .
275 "<font color=\"#FF0A0A\">$casedword</font>" . $posttext;
281 # Now write out the files. Each letter in the index goes in a different file
# Keys are visited in sorted order, so all keys sharing a first character are
# adjacent; $current_initial records which letter's file is open on OUT.
283 $current_initial = "";
285 foreach $key (sort keys %tableft)
287 my($initial) = $key =~ /^(.)/;  # first character of the key
288 $initial = "\U$initial\E";  # upper-cased, matching the keys of %initials
# A change of initial means a new file: finish off the previous one (if there
# was one), then open FAQ-KWIC_<initial>.html and write its heading.
# NOTE(review): no explicit close of OUT is visible before it is reopened.
290 if ($initial ne $current_initial)
292 if ($current_initial ne "")
294 print OUT "</table>\n";
295 &write_initials($current_initial);
296 print OUT "</body>\n</html>\n";
300 open (OUT, ">html/FAQ-KWIC_$initial.html") ||
301 die "Can't open html/FAQ-KWIC_$initial.html\n";
305 "<title>Exim FAQ: KWIC index section $initial</title>\n" .
307 "<body bgcolor=\"#F8F8F8\" text=\"#00005A\" link=\"#0066FF\" alink=\"#0066FF\" vlink=\"#000099\">\n" .
308 "<h1>Exim FAQ: Keyword-in-context index</h1>\n";
310 write_initials($initial);
316 This <i>Keyword-in-context</i> index for the Exim FAQ is generated
317 automatically from the FAQ source. Browsers may not display the data very
318 prettily, but it is hoped that it may provide a useful aid for finding things
324 print OUT "<table border>\n";  # start this letter's table
325 $current_initial = $initial;  # remember which letter's file is now open
329 print OUT "<td align=\"right\">$tableft{$key}</td>\n";  # text up to the keyword, right-aligned
330 print OUT "<td align=\"left\">$tabright{$key}</td>\n";  # keyword and following text, left-aligned
334 # Close the final file
# Same trailer as is written when switching letters; skipped when no file was
# ever opened (i.e. there were no index keys at all).
336 if ($current_initial ne "")
338 print OUT "</table>\n";
339 &write_initials($current_initial);
340 print OUT "</body>\n</html>\n";