#! /usr/bin/perl # Script to pre-process XML input before processing it for various purposes. # Options specify which transformations are to be done. Monospaced literal # layout blocks are never touched. # Changes: # -ascii: Replace ’ by ' # Replace © by (c) # Replace † by * # Replace ‡ by ** # Replace by a space # Replace – by - # Put quotes round <quote> text # # -quoteliteral: # Put quotes round <literal> text # # -bookinfo: Remove the <bookinfo> element from the file # # -fi: Replace "fi" by fi except when it is in an XML element, or # inside a <literal>. # # -html: Certain things are done only for HTML output: # # If <literallayout> is followed by optional space and then a # newline, the space and newline are removed, because otherwise you # get a blank line in the HTML output. # # -noindex Remove the XML that generates indexes. # -oneindex Ditto, but add XML to generate a single index. # # -optbreak Insert an optional line break (zero-width space, ​) after # every underscore in text within <option> and <variable> elements, # except when preceded by <entry> (i.e. not in tables). The same is # also done within a word of four or more upper-case letters (for # compile-time options). # The function that processes non-literal, non-monospaced text sub process() { my($s) = $_[0]; $s =~ s/fi(?![^<>]*>)/fi/g if $ligatures; if ($optbreak) { $s =~ s%(?<!<entry>)(<option>|<varname>)([^<]+)% my($x,$y) = ($1,$2); $y =~ s/_/_​/g; "$x"."$y"%gex; $s =~ s?\b([A-Z_]{4,})\b? my($x) = $1; $x =~ s/_/_​/g; "$x"?gex; } if ($ascii) { $s =~ s/’/'/g; $s =~ s/©/(c)/g; $s =~ s/†/*/g; $s =~ s/‡/**/g; $s =~ s/&nsbp;/ /g; $s =~ s/–/-/g; $s =~ s/<quote>/"/g; $s =~ s/<\/quote>/"/g; } $s; } # The main program $ascii = 0; $bookinfo = 0; $html = 0; $inliteral = 0; $inliterallayout = 0; $ligatures = 0; $madeindex = 0; $noindex = 0; $oneindex = 0; $optbreak = 0; $quoteliteral = 0; foreach $arg (@ARGV) { if ($arg eq "-fi") { $ligatures = 1; } elsif ($arg eq "-ascii") { $ascii = 1; } elsif ($arg eq "-bookinfo") { $bookinfo = 1; } elsif ($arg eq "-html") { $html = 1; } elsif ($arg eq "-noindex") { $noindex = 1; } elsif ($arg eq "-oneindex") { $oneindex = 1; } elsif ($arg eq "-optbreak") { $optbreak = 1; } elsif ($arg eq "-quoteliteral") { $quoteliteral = 1; } else { die "** Pre-xml: Unknown option \"$arg\"\n"; } } while (<STDIN>) { # Remove <bookinfo> if required if ($bookinfo && /^<bookinfo/) { while (<STDIN>) { last if /^<\/bookinfo/; } next; } # Copy monospaced literallayout blocks if (/^<literallayout class="monospaced">/) { $_ = substr($_, 0, -1) if $html; print; while (<STDIN>) { print; last if /^<\/literallayout>/; } next; } # Adjust index-generation code if required if (($noindex || $oneindex) && /^<index[\s>]/) { while (<STDIN>) { last if /^<\/index>/; } if ($oneindex && !$madeindex) { $madeindex = 1; print "<index><title>Index</title></index>\n"; } next; } # A line that is not in a monospaced literal block; keep track of which # parts are in <literal> and which not. The latter get processed by the # function above. Items in <literal> get quoted unless they are also in # a <literallayout> block, or are already being quoted. for (;;) { $_ = substr($_, 0, -1) if $html && /^<literallayout[^>]*>\s*\n$/; $inliterallayout = 1 if /^<literallayout/; $inliterallayout = 0 if /^<\/literallayout/; if ($inliteral) { if (/^(.*?)<\/literal>(?!<\/quote>)(.*)$/) { print $1; print "\"" if $quoteliteral && !$inliterallayout; print "</literal>"; $inliteral = 0; $_ = "$2\n"; } else { print; last; } } # Not in literal state else { if (/^(.*?)(?<!<quote>)<literal>(.*)$/) { print &process($1); print "<literal>"; print "\"" if $quoteliteral && !$inliterallayout; $inliteral = 1; $_ = "$2\n"; } else { print &process($_); last; } } } # Loop for different parts of one line } # Loop for multiple lines # End