3 # Script to pre-process XML input before processing it for various purposes.
4 # Options specify which transformations are to be done. Monospaced literal
5 # layout blocks are never touched.
9 # -ascii: Replace ’ by '
10 # Replace © by (c)
11 # Replace † by *
12 # Replace ‡ by **
13 # Replace non-breaking space (U+00A0) by a space
14 # Replace – by -
15 # Put quotes round <quote> text
18 # Put quotes round <literal> text
20 # -bookinfo: Remove the <bookinfo> element from the file
22 # -fi: Replace "fi" by the fi ligature (U+FB01) except when it is in an XML element, or
25 # -html: Certain things are done only for HTML output:
27 # If <literallayout> is followed by optional space and then a
28 # newline, the space and newline are removed, because otherwise you
29 # get a blank line in the HTML output.
31 # -noindex Remove the XML that generates indexes.
32 # -oneindex Ditto, but add XML to generate a single index.
34 # -optbreak Insert an optional line break (zero-width space, U+200B) after
35 # every underscore in text within <option> and <varname> elements,
36 # except when preceded by <entry> (i.e. not in tables). The same is
37 # also done within a word of four or more upper-case letters or
38 # underscores (for compile-time options).
40 # -epub Convert date formats to comply with epub specification
44 # The function that processes non-literal, non-monospaced text
50 $s =~ s/fi(?![^<>]*>)/&#xFB01;/g if $ligatures;  # -fi: "fi" becomes the fi-ligature character reference; the lookahead skips any "fi" that sits inside a <...> tag
54 $s =~ s%(?<!<entry>)(<option>|<varname>)([^<]+)%
55 my($x,$y) = ($1,$2); $y =~ s/_/_​/g; "$x"."$y"%gex;
57 $s =~ s?\b([A-Z_]{4,})\b?
58 my($x) = $1; $x =~ s/_/_​/g; "$x"?gex;
66 $s =~ s/‡/**/g;
70 $s =~ s/<\/quote>/"/g;
76 # Mapping needed for epub: lower-case three-letter English month abbreviation => two-digit month number (presumably the %months hash consulted when dates are rewritten)
79 jan => '01', feb => '02', mar => '03', apr => '04', may => '05', jun => '06',
80 jul => '07', aug => '08', sep => '09', oct => '10', nov => '11', dec => '12',
100 if ($arg eq "-fi") { $ligatures = 1; }  # enable the "fi" ligature substitution
101 elsif ($arg eq "-ascii") { $ascii = 1; }  # replace typographic characters by ASCII equivalents
102 elsif ($arg eq "-bookinfo") { $bookinfo = 1; }  # drop the <bookinfo> element
103 elsif ($arg eq "-epub") { $epub = 1; }  # rewrite <date> content for epub
104 elsif ($arg eq "-html") { $html = 1; }  # HTML-only newline trimming after <literallayout>
105 elsif ($arg eq "-noindex") { $noindex = 1; }  # remove index-generating XML
106 elsif ($arg eq "-oneindex") { $oneindex = 1; }  # remove indexes, then emit a single one
107 elsif ($arg eq "-optbreak") { $optbreak = 1; }  # optional line breaks after underscores
108 elsif ($arg eq "-quoteliteral") { $quoteliteral = 1; }  # put quotes round <literal> text
109 else { die "** Pre-xml: Unknown option \"$arg\"\n"; }  # any other argument is fatal
114 # Remove <bookinfo> if required (-bookinfo): once a line starting with <bookinfo is seen, input is consumed up to the line starting with </bookinfo
116 if ($bookinfo && /^<bookinfo/)
118 while (<STDIN>) { last if /^<\/bookinfo/; }  # discard lines; the loop also ends at end of input if no closing tag is found
122 # Copy monospaced literallayout blocks
124 if (/^<literallayout class="monospaced">/)
126 $_ = substr($_, 0, -1) if $html;  # -html: drop the last character of the opening line (presumably its newline) to avoid a blank line in the HTML output
131 last if /^<\/literallayout>/;  # stop at the line that closes the block
136 # Adjust index-generation code if required (-noindex or -oneindex)
138 if (($noindex || $oneindex) && /^<index[\s>]/)  # a line that opens an <index> element
142 last if /^<\/index>/;  # stop at the line that closes the element
145 if ($oneindex && !$madeindex)  # NOTE(review): $madeindex is presumably set in lines not shown so the single index is emitted once - confirm
148 print "<index><title>Index</title></index>\n";
154 # Adjust dates to YYYY-MM-DD (-epub only)
156 if ($epub && /^\s*<date[\s>]/)
161 last if /^\s*<\/date>/;
163 if (/^ \s* (\d{1,2}) \s+ ([a-zA-Z]{3}) \s+ (\d{4})/x)  # day (1-2 digits), three-letter month, four-digit year
165 die "Unknown month '$2'\n" unless exists $months{lc $2};  # month lookup is case-insensitive
166 my $month = $months{lc $2};
173 # A line that is not in a monospaced literal block; keep track of which
174 # parts are in <literal> and which not. The latter get processed by the
175 # function above. Items in <literal> get quoted unless they are also in
176 # a <literallayout> block, or are already being quoted.
180 s/^(<literallayout[^>]*>)\s*\n$/$1/ if $html;  # -html: strip the optional space AND the newline after the opening tag (not just the final character), so neither a blank line nor stray leading space reaches the HTML output
181 $inliterallayout = 1 if /^<literallayout/;  # inside a <literallayout> block, <literal> items are not quoted
182 $inliterallayout = 0 if /^<\/literallayout/;
186 if (/^(.*?)<\/literal>(?!<\/quote>)(.*)$/)  # earliest </literal> that is not immediately followed by </quote>
189 print "\"" if $quoteliteral && !$inliterallayout;  # quote mark for -quoteliteral, suppressed inside <literallayout>
201 # Not in literal state
205 if (/^(.*?)(?<!<quote>)<literal>(.*)$/)  # earliest <literal> that is not immediately preceded by <quote>
209 print "\"" if $quoteliteral && !$inliterallayout;  # quote mark for -quoteliteral, suppressed inside <literallayout>
219 } # Loop for different parts of one line
220 } # Loop for multiple lines