#! /usr/bin/perl -w # $Cambridge: exim/doc/doc-scripts/g2h,v 1.2 2005/01/27 10:25:35 ph10 Exp $ # This is a script that turns the SGCAL source of Exim's documentation into # HTML. It can be used for both the filter document and the main Exim # specification. The syntax is # # g2h [-split no|section|chapter] # # Previously, -split section was used for the filter document, and -split # chapter for the main specification. However, the filter document has gained # some chapters, so they are both split by chapter now. Only one -split can be # specified. # # A number of assumptions about the style of the input markup are made. # # The HTML is written into the directory html/ using the source file base # name as its base. # Written by Philip Hazel # Starting 21-Dec-2001 # Last modified 26-Nov-2003 ############################################################################# ################################################## # Open an output file # ################################################## sub openout { open (OUT, ">$_[0]") || die "Can't open $_[0]\n"; # Boilerplate print OUT "<!DOCTYPE HTML PUBLIC \"-//IETF//DTD HTML//EN\">\n"; print OUT "<html>\n<head>\n<title>$doctitle" . (($thischapter > 0)? " chapter $thischapter" : "") . (($thissection > 0)? " section $thissection" : "") . "\n\n" . "\n"; # Forward/backward links when chapter splitting if ($chapsplit) { print OUT "\n"; printf OUT ("Previous  \n", $thischapter - 1) if $thischapter > 1; printf OUT ("Next  \n", $thischapter + 1) if $thischapter < $maxchapter; print OUT "Contents\n"; print OUT " " x 6, "($doctitle)\n
\n"; } # Forward/backward links when section splitting elsif ($sectsplit) { print OUT "\n"; printf OUT ("Previous  \n", $thissection - 1) if $thissection > 1; printf OUT ("Next  \n", $thissection + 1) if $thissection < $maxsection; print OUT "Contents\n"; print OUT " " x 6, "($doctitle)\n
\n"; } # Save the final component of the current file name (for TOC creation) $_[0] =~ /^(?:.*)\/([^\/]+)$/; $current_file = $1; } ################################################## # Close an output file # ################################################## # The first argument is one of: # # "CHAP" a chapter is ending # "SECT" a section is ending # "" the whole thing is ending # # In the first two cases $thischapter and $thissection contain the new chapter # and section numbers, respectively. In the third case, we can deduce what is # ending from the flags. The variables contain the current values. sub closeout { my($s) = $_[0]; print OUT "
\n" if !$lastwasrule; &setpar(0); if ($s eq "CHAP") { print OUT "\n"; printf OUT ("Previous  ", $thischapter - 2) if ($thischapter > 2); print OUT "Next  "; print OUT "Contents\n"; print OUT " " x 6, "($doctitle)\n\n"; } elsif ($s eq "SECT") { print OUT "\n"; printf OUT ("Previous  ", $thissection - 2) if ($thissection > 2); print OUT "Next  "; print OUT "Contents\n"; print OUT " " x 6, "($doctitle)\n\n"; } else { if ($chapsplit) { print OUT "\n"; printf OUT ("Previous  ", $thischapter - 1) if ($thischapter > 1); print OUT "Contents\n"; print OUT " " x 6, "($doctitle)\n\n"; } elsif ($sectsplit) { print OUT "\n"; printf OUT ("Previous  ", $thissection - 1) if ($thissection > 1); print OUT "Contents\n"; print OUT " " x 6, "($doctitle)\n\n"; } } print OUT "\n\n"; close(OUT); } ################################################## # Handle an index line # ################################################## # This function returns an empty string so that it can be called as part # of an s operator when handling index items within paragraphs. The two # arguments are: # # the text to index, already converted to HTML # 1 for the concept index, 0 for the options index sub handle_index { my($text) = $_[0]; my($hash) = $_[1]? \%cindex : \%oindex; my ($key,$ref); # Up the index count, and compute the reference to the file and the # label within it. $index_count++; $ref = $chapsplit? "${file_base}_$thischapter.html#IX$index_count" : $sectsplit? "${file_base}_$thissection.html#IX$index_count" : "#IX$index_count"; # Create the index key, which consists of the text with all the HTML # coding and any leading quotation marks removed. Turn the primary/secondary # splitting string "||" into ":". $text =~ s/\|\|/:/g; $key = "$text"; $key =~ s/<[^>]+>//g; $key =~ s/&#(\d+);/chr($1)/eg; $key =~ s/^`+//; $key =~ s/^"//; # Turn all spaces in the text into   so that they don't ever split. 
# However, there may be spaces in the HTML that already exists in the # text, so we have to avoid changing spaces inside <>. $text =~ s/ (?=[^<>]*(?:<|$))/ /g; # If this is the first encounter with this index key, we create a # straightforward reference. if (!defined $$hash{$key}) { $$hash{$key} = "$text"; } # For the second and subsequent encounters, add "[2]" etc. to the # index text. We find out the number by counting occurrences of "[$number]"; } # Place the name in the current output print OUT "\n"; return ""; } ################################################## # Handle emphasis bars # ################################################## # Set colour green for text marked with "emphasis bars", keeping # track in case the matching isn't perfect. sub setinem { if ($_[0]) { return "" if $inem; $inem = 1; return "\n"; } else { return "" if !$inem; $inem = 0; return "\n"; } } ################################################## # Convert marked-up text # ################################################## # This function converts text from SGCAL markup to HTML markup, with a couple # of exceptions: # # 1. We don't touch $t because that is handled by the .display code. # # 2. The text may contain embedded .index, .em, and .nem directives. We # handle .em and .nem, but leave .index because it must be done during # paragraph outputting. # # In a non-"rm" display, we turn $rm{ into cancelling of . Otherwise # it is ignored - in practice it is only used in that special case. # # The order in which things are done in this function is highly sensitive! sub handle_text { my($s) = $_[0]; my($rmspecial) = $_[1]; # Escape all & characters (they aren't involved in markup) but for the moment # use &+ instead of &# so that we can handle # characters in the text. $s =~ s/&/&+038;/g; # Turn SGCAL literals into HTML literals that don't look like SGCAL # markup, so won't be touched by what follows. Again, use + instead of #. 
$s =~ s/@@/&+064;/g; $s =~ s/@([^@])/"&+".sprintf("%0.3d",ord($1)).";"/eg; # Now turn any #s that are markup into spaces, and convert the previously # created literals to the correct form. $s =~ s/#/ /g; $s =~ s/&\+(\d+);/&#$1;/g; # Some simple markup that doesn't involve argument text. $s =~ s/\$~//g; # turn $~ into nothing $s =~ s/__/_/g; # turn __ into _ $s =~ s/--(?=$|\s|\d)/–/mg; # turn -- into endash in text or number range $s =~ s/\(c\)/©/g; # turn (c) into copyright symbol # Use double quotes # $s =~ s/`([^']+)'/``$1''/g; $s =~ s/`([^']+)'/“$1”/g; # This is a fudge for some specific usages of $<; can't just do a global # is it occurs in things like "$" as well. $s =~ s/(\d)\$<-/$1-/g; # turn 0$<- into 0- $s =~ s/\$> into equivalent SGCAL markup that doesn't involve the use of # < and >, and then escape the remaining < and > characters in the text. $s =~ s/<<([^>]*?)>>/<\$it{$1}>/g; # turn <> into <$it{xxx}> $s =~ s//>/g; # Other markup... $s =~ s/\$sm\{//g; # turn $sm{ into nothing $s =~ s/\$smc\{//g; # turn $smc{ into nothing $s =~ s/\$smi\{//g; # turn $smi{ into nothing $s =~ s/\$tt\{([^\}]*?)\}/$1<\/tt>/g; # turn $tt{xxx} into xxx $s =~ s/\$it\{([^\}]*?)\}/$1<\/em>/g; # turn $it{xxx} into xxx $s =~ s/\$bf\{([^\}]*?)\}/$1<\/b>/g; # turn $bf{xxx} into xxx $s =~ s/\$cb\{([^\}]*?)\}/$1<\/b><\/tt>/g; # turn $cb{xxx} into # xxx $s =~ s/\\\\([^\\]*?)\\\\/$1<\/font>/g; # turn \\xxx\\ into # small font $s =~ s/\\\?([^?]*?)\?\\/$1<\/a>/g; # turn \?URL?\ into URL $s =~ s/\\\(([^)]*?)\)\\/$1<\/i>/g; # turn \(xxx)\ into xxx $s =~ s/\\\"([^\"]*?)\"\\/$1<\/tt>/g; # turn \"xxx"\ into xxx $s =~ s/\\\$([^\$]*?)\$\\/\$$1<\/tt>/g; # turn \$xxx$\ into $xxx $s =~ s/\\\-([^\\]*?)\-\\/-$1<\/i>/g; # turn \-xxx-\ into -italic $s =~ s/\\\*\*([^*]*?)\*\*\\/$1<\/b>/g; # turn \**xxx**\ into xxx $s =~ s/\\\*([^*]*?)\*\\/$1<\/i>/g; # turn \*xxx*\ into italic $s =~ s/\\%([^*]*?)%\\/$1<\/b>/g; # turn \%xxx%\ into bold $s =~ s/\\([^\\]*?)\\/$1<\/tt>/g; # turn \xxx\ into xxx $s =~ 
s/::([^\$]*?)::/$1:<\/i>/g; # turn ::xxx:: into italic: $s =~ s/\$\*\$/\*/g; # turn $*$ into * # Handle $rm{...} if ($rmspecial) { $s =~ s/\$rm\{([^\}]*?)\}/<\/tt>$1/g; # turn $rm{xxx} into xxx } else { $s =~ s/\$rm\{([^\}]*?)\}/$1/g; # turn $rm{xxx} into xxx } # There is one case where the terminating } of an escape sequence is # in another paragraph - this follows $sm{ - it can be fixed by # removing any stray } in a paragraph that contains no { chars. $s =~ s/\}//g if !/\{/; # Remove any null flags ($$) $s =~ s/\$\$//g; # If the paragraph starts with $c\b, remove it. $s =~ s/^\$c\b//; # If the paragraph starts with $e\b, indent it slightly. $s =~ s/^\$e\b/  /; # Handle .em, and .nem directives that occur within the paragraph $s =~ s/\.em\s*\n/&setinem(1)/eg; $s =~ s/\.nem\s*\n/&setinem(0)/eg; # Explicitly included HTML $s =~ s/\[\(([^)]+)\)\]/<$1>/g; # turn [(...)] into <...> # Finally, do the substitutions and return the modified text. $s =~ s/~~(\w+)/$var_value{$1}/eg; return $s; } ################################################## # Start/end a paragraph # ################################################## # We want to leave paragraphs unterminated until we know that a horizontal # rule does not follow, to avoid getting space inserted before the rule, # which doesn't look good. So we have this function to help control things. # If the argument is 1 we are starting a new paragraph; if it is 0 we want # to force the ending of any incomplete paragraph. sub setpar { if ($inpar) { print OUT "

\n"; $inpar = 0; } if ($_[0]) { print OUT "

\n"; $inpar = 1; } } ################################################## # Handle a "paragraph" # ################################################## # Read a paragraph of text, which may contain many lines and may contain # .index, .em, and .nem directives within it. We may also encounter # ".if ~~html" within paragraphs. Process those directives, # convert the markup, and output the rest as an HTML paragraph. sub handle_paragraph{ my($par) = $_; my($htmlcond) = 0; while() { if (/^\.if\s+~~html\b/) { $htmlcond = 1; $par =~ s/\s+$//; # lose unwanted whitespace and newlines next; } elsif ($htmlcond && /^\.else\b/) { while () { last if /^\.fi\b/; } $htmlcond = 0; next; } elsif ($htmlcond && /^\.fi\b/) { $htmlcond = 0; next; } last if /^\s*$/ || (/^\./ && !/^\.index\b/ && !/^\.em\b/ && !/^\.nem\b/); $par .= $_; } $par = &handle_text($par, 0); # We can't handle .index until this point, when we do it just before # outputting the paragraph. if ($par !~ /^\s*$/) { &setpar(1); $par =~ s/\.index\s+([^\n]+)\n/&handle_index($1, 1)/eg; print OUT "$par"; } } ################################################## # Handle a non-paragraph directive # ################################################## # The directives .index, .em, and .nem can also appear within paragraphs, # and are then handled within the handle_paragraph() code. sub handle_directive{ my($new_lastwasitem) = 0; $lastwasrule = 0; if (/^\.r?set\b/ || /^\.(?:\s|$)/) {} # ignore .(r)set and comments elsif (/^\.justify\b/) {} # and .justify elsif (/^\.newline\b/) { print OUT "
\n"; } elsif (/^\.blank\b/ || /^\.space\b/) { print OUT "
\n"; } elsif (/^\.rule\b/) { &setpar(0); print OUT "


\n"; $lastwasrule = 1; } elsif (/^\.index\s+(.*)/) { &handle_index(&handle_text($1), 1); } # Emphasis is handled by colour elsif (/^\.em\b/) { &setpar(0); print OUT "" if ! $inem; $inem = 1; } elsif (/^\.nem\b/) { &setpar(0); print OUT "" if $inem; $inem = 0; } # Ignore tab setting stuff - we use tables instead. elsif (/^\.tabs(?:et)?\b/) {} # .tempindent is used only to align some of the expansion stuff nicely; # just ignore it. It is used in conjunction with .push/.pop. elsif (/^\.(tempindent|push|pop)\b/) {} # There are some instances of .if ~~sys.fancy in the source. Some of those # that are not inside displays are two-part things, in which case we just keep # the non-fancy part. For diagrams, however, they are in three parts: # # .if ~~sys.fancy # # .elif !~~html # # .else # # .fi # # In this case, we skip to the third part. elsif (/^\.if\s+~~sys\.fancy/ || /^\.else\b/) { while () { last if /^\.else\b/ || /^\.elif\s+!\s*~~html/ || /^\.fi\b/; } if (/^\.elif\b/) { while () { last if /^\.else\b/ || /^\.fi\b/; } } } # Similarly, for .if !~~sys.fancy, take the non-fancy part. elsif (/^\.if\s+!\s*~~sys.fancy/) {} # There are some explicit tests for ~~html for direct HTML inclusions elsif (/^\.if\s+~~html\b/) {} # There are occasional requirements to do things differently for Texinfo/HTML # and PS/txt versions. The latter are produced by SGCAL, so that's what the # flag is called. elsif (/\.if\s+~~sgcal/) { while () { last if /\.else\b/ || /\.fi\b/; } } # Also there is a texinfo flag elsif (/^\.if\s+~~texinfo\b/) { while () { last if /^\.else\b/ || /^\.elif\s+!\s*~~html/ || /^\.fi\b/; } } # Ignore any other .if, .else, or .fi directives elsif (/^\.if\b/ || /^\.fi\b/ || /^\.else\b/) {} # Ignore .indent elsif (/^\.indent\b/) {} # Various flavours of numberpars map to corresponding list types. 
elsif (/^\.numberpars\b/) { $rest = $'; &setpar(0); if ($rest =~ /(?:\$\.|\" \")/) { unshift @endlist, "ul"; unshift @listtype, ""; print OUT "\n\n\n"; close(TOC); close(IN); } ################################################## # Adjust index points # ################################################## # Because of the way the source is written, there are often index entries # that immediately follow the start of chapters and sections and the definition # of "items" like "helo = verify". This gets the correct page numbers for the # PostScript and PDF formats. However, for HTML we want the index anchor to be # before the section heading, because browsers tend to put the index point at # the top of the screen. So we re-read all the files we've just created, and # move some of the index points about. This is necessary only if indexes exist. # The files are small enough to be handled entirely in memory. sub adjust_index_points { print "Adjusting index points to precede headings\n"; $" = ""; opendir(DIR, "$html") || die "Failed to opendir $html\n"; while ($file = readdir(DIR)) { my($i); next unless $file =~ /^${file_base}_\d+\.html$/; open(IN, "<$html/$file") || die "Failed to open $html/$file (read)\n"; my(@lines) = ; close(IN); for ($i = 0; $i < @lines; $i++) { if ($lines[$i] =~ /^<\/a>$/) { # Handle an index line that follows a heading definition. Move it back # to just before the

or whatever. This preserves the order of # multiple index lines, not that that matters. if ($lines[$i-1] =~ /^<\/a><\/h(\d)>/) { my($j); my($found) = 0; for ($j = $i-2; $j > 0 && $j > $i - 10; $j--) { if ($lines[$j] =~ //) { $found = 1; last; } } if ($found) { splice(@lines, $j, 0, splice(@lines, $i, 1)); } } # Handle an index line that follows an "item". Move it back one line. elsif ($lines[$i-1] =~ /^.*<\/b>\s*$/) { splice(@lines, $i-1, 0, splice(@lines, $i, 1)); } # Handle an index line that follows a "conf" definition elsif ($lines[$i-1] =~ /^Type:<\/i>/ && $lines[$i-2] =~ /^

/) { splice(@lines, $i-2, 0, splice(@lines, $i, 1)); } # Handle an index line that follows an "option" definition elsif ($lines[$i-1] =~ /^

/) { splice(@lines, $i-1, 0, splice(@lines, $i, 1)); } } } open(OUT, ">$html/$file") || die "Failed to open $html/$file (write)\n"; print OUT "@lines"; close OUT; undef @lines; } } ################################################## # Create Index # ################################################## sub create_index{ my($hash) = $_[0]; my($ifname) = $_[1]; my($ititle) = $_[2]; my(%indexindex); open(INDEX, ">$html/${file_base}_$_[1].html") || die "Failed to open $html/${file_base}_$ifname\n"; print INDEX "\n"; print INDEX "\n\n$doctitle $ititle\n"; print INDEX "\n\n"; print INDEX "\n"; print INDEX "

$ititle

\n"; # We have to scan the keys in the hash twice; first to build the list # of initial letters, and then to do the business. The first time we # do not need to sort them. foreach $key (keys %$hash) { my($initial) = substr($key,0,1); $initial = "\U$initial"; $indexindex{$initial} = 1 if $initial ge "A" && $initial le "Z"; } print INDEX "

\n"; foreach $key (sort keys %indexindex) { print INDEX " $key\n"; } print INDEX "


\n"; my($letter) = ""; print INDEX "

\n"; foreach $key (sort { my($aa) = $a; my($bb) = $b; $aa =~ s/^\x93//; # Seems like the actual char values are $bb =~ s/^\x93//; # set by this time, not "“" return ("\L$aa" eq "\L$bb")? ("$aa" cmp "$bb") : ("\L$aa" cmp "\L$bb"); } keys %$hash) { my($initial) = substr($key,0,1); $initial = "\U$initial"; if ($initial ne $letter && $initial ge "A" && $initial le "Z") { print INDEX "
\n"; print INDEX "\n"; print INDEX "\U$initial\E
\n"; $letter = $initial; } print INDEX "$$hash{$key}
\n"; } print INDEX "

\n"; print INDEX "\n\n"; close(INDEX); } ################################################## # Show usage and die # ################################################## sub usage { die "Usage: g2h [-split no|section|chapter] \n"; } ################################################## # Entry point and main program # ################################################## # Directory in which to put the new HTML files $html = "html"; # Global variables. %cindex = (); %oindex = (); $chapsplit = 0; $cindex_tocn = 0; $confuse = ""; $file_base = ""; $index_count = 0; $inem = 0; $inpar = 0; $lastwasitem = 0; $lastwasrule = 0; $oindex_tocn = 0; $sectsplit = 0; $source_file = ""; $thischapter = 0; $thissection = 0; # Handle options my($splitset) = 0; while (scalar @ARGV > 0 && $ARGV[0] =~ /^-/) { if ($ARGV[0] eq "-split" && !$splitset) { $splitset = 1; shift @ARGV; my($type) = shift @ARGV; if ($type eq "section") { $sectsplit = 1; } elsif ($type eq "chapter") { $chapsplit = 1; } elsif ($type eq "no" ) { $sectsplit = $chapsplit = 0; } else { &usage(); } } else { &usage(); } } # Get the source file and its base &usage() if scalar @ARGV <= 0; $source_file = shift @ARGV; ($file_base) = $source_file =~ /^(.*)\.src$/; &usage() if scalar @ARGV <= 0; $doctitle = shift @ARGV; print "\nCreate HTML for $doctitle from $source_file\n"; # Remove the old HTML files print "Removing old HTML files\n"; system("/bin/rm -rf $html/${file_base}_*.html"); # First pass identifies all the chapters and sections, and collects the # values of the cross-referencing variables. print "Scanning for cross-references\n"; &pass_one(); $maxchapter = $thischapter; # Used if chapter splitting $maxsection = $thissection; # Used if section splitting # Second pass actually creates the HTML files. 
# Run the second pass, which writes the HTML output files.
print "Creating the HTML files\n";
pass_two();

# The index anchors only need to be moved in front of their headings when
# at least one of the two indexes actually collected an entry.
adjust_index_points() if %cindex || %oindex;

# Emit each index that has data. Every row holds: the hash of collected
# entries, the file-name suffix, the index title, and the progress message.
my @index_jobs = (
    [ \%cindex, "cindex", "Concepts", "Creating concept index\n" ],
    [ \%oindex, "oindex", "Options",  "Creating option index\n"  ],
);

for my $job (@index_jobs) {
    my ($entries, $suffix, $title, $banner) = @$job;

    # Nothing was indexed for this kind, so no index file is written.
    next unless %$entries;

    print $banner;
    create_index($entries, $suffix, $title);
}

# End of g2h