From b32a971138c1120763af565a142787cf3175ced7 Mon Sep 17 00:00:00 2001
From: Phil Pennock
Date: Sat, 5 Feb 2011 00:23:31 -0500
Subject: [PATCH] Strip \x{c2} from .txt files and audit.

Am unable to keep the build process from inserting spurious \x{c2}
characters into the created .txt files. Strip the characters in
Tidytxt. Add SanityTestText to do a final audit for non-ASCII
characters in the .txt files. SanityTestText rejects a missing or
unreadable file up front, so that a failed open can never be reported
as a clean audit.

Dependency: pcregrep if available, else uses Perl.
---
Note: SanityTestText was revised after this patch was first exported, so
the commit id on the first line and the blob id on the SanityTestText
"index" line no longer describe the content below.

 doc/doc-docbook/Makefile       |  2 ++
 doc/doc-docbook/SanityTestText | 61 ++++++++++++++++++++++++++++++++++
 doc/doc-docbook/Tidytxt        |  5 +++
 3 files changed, 68 insertions(+)
 create mode 100755 doc/doc-docbook/SanityTestText

diff --git a/doc/doc-docbook/Makefile b/doc/doc-docbook/Makefile
index ed0ad8435..4f9232812 100644
--- a/doc/doc-docbook/Makefile
+++ b/doc/doc-docbook/Makefile
@@ -88,6 +88,7 @@ filter.txt: filter-txt.xml Tidytxt MyStyle-txt-html.xsl MyStyle-html.xsl \
 	/bin/rm -rf filter-txt.html
 	xmlto -x MyStyle-txt-html.xsl html-nochunks filter-txt.xml
 	w3m -dump filter-txt.html | ./Tidytxt >filter.txt
+	./SanityTestText filter.txt
 
 # I have not found a way of making docbook2texi write its output anywhere
 # other than the file name that it makes up. The --to-stdout option does not
@@ -168,6 +169,7 @@ spec.txt: spec-txt.xml Tidytxt MyStyle-txt-html.xsl MyStyle-html.xsl \
 	/bin/rm -rf spec-txt.html
 	xmlto -x MyStyle-txt-html.xsl html-nochunks spec-txt.xml
 	w3m -dump spec-txt.html | ./Tidytxt >spec.txt
+	./SanityTestText spec.txt
 
 # I have not found a way of making docbook2texi write its output anywhere
 # other than the file name that it makes up. The --to-stdout option does not
diff --git a/doc/doc-docbook/SanityTestText b/doc/doc-docbook/SanityTestText
new file mode 100755
index 000000000..25b181e3d
--- /dev/null
+++ b/doc/doc-docbook/SanityTestText
@@ -0,0 +1,61 @@
+#!/bin/sh
+
+# Audit a generated .txt file for characters outside printable ASCII
+# (0x20-0x7E).
+#
+# Usage: SanityTestText filename
+#
+# Exit status:
+#   0  no offending characters found
+#   1  offending characters found
+#   2  file missing or unreadable, or the checking tool reported a problem
+#   3  unexpected exit status from the checking tool
+#
+# Portability note:
+# This tool is only used in building spec.txt for a release, not used as
+# part of the normal build/install process, so only Maintainers are affected
+# by requirements here.
+
+filename="$1"
+
+# Refuse a missing or unreadable file up front.  In the Perl fallback the file
+# is opened by a shell redirection; a failed redirection can yield status 1,
+# which the case statement below would treat as "file is clean".
+if [ ! -f "$filename" ] || [ ! -r "$filename" ]
+then
+  echo >&2 "$0: cannot read file: $filename"
+  exit 2
+fi
+
+# Both checkers follow the grep convention: 0 = match found, 1 = no match.
+# NOTE(review): the Perl pattern tolerates \r while the pcregrep pattern does
+# not, so the two can disagree on files containing CR characters.
+if echo a | pcregrep -q a 2>/dev/null
+then
+  checker=pcregrep
+  pcregrep -q '[^\x{20}-\x{7E}]' "$filename"
+  grepstatus=$?
+else
+  checker=perl
+  perl -ne 'BEGIN {$rv=1};END {exit $rv};
+    if (/[^\r\n\x{20}-\x{7E}]/) { $rv = 0; last }' < "$filename"
+  grepstatus=$?
+fi
+
+case $grepstatus in
+0)
+  echo >&2 "$0: found non-ASCII characters in $filename"
+  exit 1
+  ;;
+1)
+  exit 0
+  ;;
+2)
+  echo >&2 "$0: problem checking for non-ASCII characters in $filename"
+  exit 2
+  ;;
+*)
+  echo >&2 "$0: unhandled return value from $checker: $grepstatus"
+  exit 3
+  ;;
+esac
diff --git a/doc/doc-docbook/Tidytxt b/doc/doc-docbook/Tidytxt
index a56124ccf..a628b6dcf 100755
--- a/doc/doc-docbook/Tidytxt
+++ b/doc/doc-docbook/Tidytxt
@@ -38,6 +38,11 @@ foreach $line (@lines)
   $line =~ s/\x{e2}\x{95}\x{b2}/\\/g;
   $line =~ s/\x{e2}\x{95}\x{b3}/X/g;
 
+  # w3m rendering issue apparently only seen by pdp
+  # affects section numbers after the ToC, some info on spool-file -lines, etc
+  # always appears to be a spurious extra character, safely just dropped.
+  $line =~ s/\x{c2}//g;
+
   if ($line =~ /^\s*$/)
     {
     $line = "" if $lastwasblank;
-- 
2.30.2