From 263c04a6b6ad8a18c9cdb7da847b695f0a8d6787 Mon Sep 17 00:00:00 2001 From: "Heiko Schlittermann (HS12)" Date: Sun, 10 May 2015 16:01:44 +0200 Subject: [PATCH] Docs: Make build unicode resistant Force LC_ALL=C for spec.txt. Add an additional build target: spec.utf8. --- doc/doc-docbook/.gitignore | 1 + doc/doc-docbook/GenLocalParams | 8 ++++ doc/doc-docbook/Makefile | 40 ++++++++++++++------ doc/doc-docbook/Tidytxt | 67 ++++++++++++++++++++++------------ 4 files changed, 81 insertions(+), 35 deletions(-) diff --git a/doc/doc-docbook/.gitignore b/doc/doc-docbook/.gitignore index ae93d1875..62828bf6f 100644 --- a/doc/doc-docbook/.gitignore +++ b/doc/doc-docbook/.gitignore @@ -3,6 +3,7 @@ spec*.xml spec.ps spec.pdf spec.txt +spec.utf8 filter*.xml filter.ps filter.pdf diff --git a/doc/doc-docbook/GenLocalParams b/doc/doc-docbook/GenLocalParams index 140890880..fc8e7fc58 100755 --- a/doc/doc-docbook/GenLocalParams +++ b/doc/doc-docbook/GenLocalParams @@ -5,8 +5,16 @@ output="${1:-local_params}" nicedate="$(date +"%d %b %Y")" +if which locale >/dev/null; then + charset="$(locale | grep ^LC_CTYPE=)" +else + charset=unknown +fi + exec > "$output" cat <filter.txt + LC_ALL=C w3m -dump filter-txt.html | ./Tidytxt >filter.txt ./SanityTestText filter.txt # I have not found a way of making docbook2texi write its output anywhere @@ -107,8 +115,8 @@ filter.info: filter-info.xml ################################ SPEC ################################## -spec.xml: local_params spec.xfpt - xfpt spec.xfpt +spec.xml: spec.xfpt local_params + xfpt $< spec-pr.xml: spec.xml Pre-xml ./Pre-xml -optbreak spec-pr.xml @@ -165,13 +173,21 @@ spec.pdf: sdop-spec.pdf ### ### -spec.txt: spec-txt.xml Tidytxt MyStyle-txt-html.xsl MyStyle-html.xsl \ - MyStyle.xsl - /bin/rm -rf spec-txt.html - xmlto -x MyStyle-txt-html.xsl html-nochunks spec-txt.xml - w3m -dump spec-txt.html | ./Tidytxt >spec.txt +spec-txt.html: spec-txt.xml \ + MyStyle-txt-html.xsl MyStyle-html.xsl MyStyle.xsl + xmlto -x 
MyStyle-txt-html.xsl html-nochunks $< + +spec.utf8: spec-txt.html Tidytxt + @grep -iq 'LC_CTYPE=.*utf-\?8' local_params || { \ + echo 'your current locale does not support UTF-8' >&2; \ + false; } + w3m -dump $< | ./Tidytxt -utf8 >$@ + +spec.txt: spec-txt.html Tidytxt + LC_ALL=C w3m -dump $< | ./Tidytxt >$@ ./SanityTestText spec.txt + # I have not found a way of making docbook2texi write its output anywhere # other than the file name that it makes up. The --to-stdout option does not # work. diff --git a/doc/doc-docbook/Tidytxt b/doc/doc-docbook/Tidytxt index 9eb63dbcb..cfa692272 100755 --- a/doc/doc-docbook/Tidytxt +++ b/doc/doc-docbook/Tidytxt @@ -1,5 +1,20 @@ #! /usr/bin/perl +use strict; +use warnings; +use Getopt::Long; + + +# For now we can't rely on a perl >= 5.14 on +# the build sites, thus we throw away all unicode +# awareness and do the matching byte by byte +binmode STDIN; +binmode STDOUT; + +GetOptions( + 'u|utf8!' => \my $want_utf8, # do not replace unicode characters +) or die "Usage: $0 [-u|--utf8]\n"; + # Script to tidy up the output of w3m when it makes a text file. First we # convert sequences of blank lines into a single blank line, to get everything # uniform. Then we go through and insert blank lines before chapter and @@ -11,32 +26,36 @@ # (2) It uses U+25CF as its bullet character. # (3) It inserts a whole slew of "box drawing" characters round the heading. 
-@lines = <>; +my @lines = <>; +my $lastwasblank = 0; -$lastwasblank = 0; -foreach $line (@lines) +foreach my $line (@lines) { # (1) non-break space -> normal space $line =~ s/\x{c2}\x{a0}/ /g; - # (2) bullet -> asterisk - $line =~ s/\x{e2}\x{97}\x{8f}/*/g; - $line =~ s/\x{e2}\x{80}\x{a2}/*/g; # OpenSUSE - $line =~ s/\x{e2}\x{96}\x{a1}/*/g; # OpenSUSE - # (3a) horizontal box drawing -> hyphen - $line =~ s/\x{e2}\x{94}[\x{80}\x{81}\x{84}\x{85}\x{88}\x{89}]/-/g; - $line =~ s/\x{e2}\x{95}[\x{8c}\x{8d}\x{90}]/-/g; - $line =~ s/\x{e2}\x{95}[\x{b4}\x{b6}\x{b8}\x{ba}\x{bc}\x{be}]/-/g; - # (3b) vertical box drawing -> bar - $line =~ s/\x{e2}\x{94}[\x{82}\x{83}\x{86}\x{87}\x{8a}\x{8b}]/|/g; - $line =~ s/\x{e2}\x{95}[\x{8e}\x{8f}\x{91}]/|/g; - $line =~ s/\x{e2}\x{95}[\x{b5}\x{b7}\x{b9}\x{bb}\x{bd}\x{bf}]/|/g; - # (3c) corner box drawing -> plus - $line =~ s/\x{e2}\x{94}[\x{8c}-\x{bf}]/+/g; - $line =~ s/\x{e2}\x{95}[\x{80}-\x{8b}\x{92}-\x{b0}]/+/g; - # other - $line =~ s/\x{e2}\x{95}\x{b1}/\//g; - $line =~ s/\x{e2}\x{95}\x{b2}/\\/g; - $line =~ s/\x{e2}\x{95}\x{b3}/X/g; + + unless ($want_utf8) + { + # (2) bullet -> asterisk + $line =~ s/\x{e2}\x{97}\x{8f}/*/g; + $line =~ s/\x{e2}\x{80}\x{a2}/*/g; # OpenSUSE + $line =~ s/\x{e2}\x{96}\x{a1}/*/g; # OpenSUSE + # (3a) horizontal box drawing -> hyphen + $line =~ s/\x{e2}\x{94}[\x{80}\x{81}\x{84}\x{85}\x{88}\x{89}]/-/g; + $line =~ s/\x{e2}\x{95}[\x{8c}\x{8d}\x{90}]/-/g; + $line =~ s/\x{e2}\x{95}[\x{b4}\x{b6}\x{b8}\x{ba}\x{bc}\x{be}]/-/g; + # (3b) vertical box drawing -> bar + $line =~ s/\x{e2}\x{94}[\x{82}\x{83}\x{86}\x{87}\x{8a}\x{8b}]/|/g; + $line =~ s/\x{e2}\x{95}[\x{8e}\x{8f}\x{91}]/|/g; + $line =~ s/\x{e2}\x{95}[\x{b5}\x{b7}\x{b9}\x{bb}\x{bd}\x{bf}]/|/g; + # (3c) corner box drawing -> plus + $line =~ s/\x{e2}\x{94}[\x{8c}-\x{bf}]/+/g; + $line =~ s/\x{e2}\x{95}[\x{80}-\x{8b}\x{92}-\x{b0}]/+/g; + # other + $line =~ s/\x{e2}\x{95}\x{b1}/\//g; + $line =~ s/\x{e2}\x{95}\x{b2}/\\/g; + $line =~ s/\x{e2}\x{95}\x{b3}/X/g; + } # w3m 
rendering issue apparently only seen by pdp # affects section numbers after the ToC, some info on spool-file -lines, etc @@ -54,6 +73,7 @@ foreach $line (@lines) # Find start of TOC, uppercasing its title +my $i = 0; for ($i = 0; $i < scalar @lines; $i++) { $lines[$i] = "TABLE OF CONTENTS\n" if $lines[$i] =~ /^Table of Contents/; @@ -69,7 +89,8 @@ for ($i++; $i < scalar @lines; $i++) # looking for preceding and following blank lines, and then matching against # the numbers. -$chapter = 0; +my $chapter = 0; +my $section; for (; $i < scalar @lines; $i++) { next if $lines[$i-1] !~ /^$/ || $lines[$i+1] !~ /^$/; -- 2.30.2