#dehtml.awk: Removes all HTML tags from a file, as a preliminary to spell
# checking; common ampersand "&entities;" are also resolved into single
# characters.
#
# Typical use:
#
#   awk -f dehtml.awk infile.html > outfile.txt
#
# This program is written in the ``awk'' programming language.  On Sun systems
# (and some others) the non-archaic ``awk'' is called ``nawk'', so ``nawk''
# should be used there instead of ``awk''.  Also, a freely-redistributable
# ``awk'' interpreter called ``gawk'', which is free of the bugs that some of
# the vendor-supplied ``awk''/``nawk'' programs suffer from, is available for
# most platforms, and as source from the FSF GNU project.
#
# This program processes all files on the command line to STDOUT; to process a
# number of files individually, use the iteration mechanism of your shell; for
# example:
#
#   for a in *.html ; do awk -f dehtml.awk $a > otherdir/$a ; done
#
# in Unix sh, or:
#
#   for %a in (*.htm) do call dehtml %a otherdir\%a
#
# in MS-DOS, where dehtml.bat is the following one-line batch file:
#
#   gawk -f dehtml.awk %1 > %2
#
# Copyright H. Churchyard 1994, 1995 -- freely redistributable.
#
#  Version 1.0 11/27/94 -- Included in htmlchek 3.0 release.
#  Version 1.1 12/6/94 -- Fixed minor bug which could unpredictably cause a
# string such as "&amp;eacute;" to be reduced into a single character;
# added "&shy;".  Included in htmlchek 3.01 release.
#  Version 1.2 1/12/95 -- No error on `>' outside tag; minor bugfix.  Included
# in htmlchek 4.0 release.
#
# Build the entity table before any input is read.
#
# amp[] maps an entity pattern to the single character that replaces it.
# Each key is used as a dynamic regular expression by gsub() in the main
# rule, which is why numeric references spell `#' as the bracket expression
# [\043].  Each value is one byte written as an octal escape.
BEGIN {
    #This will test the 8-bit-cleanliness of your awk:

    # Space, double quote, less-than, greater-than.
    amp["&[\043]32;"] = "\040"
    amp["&nbsp;"]     = "\040"
    amp["&[\043]34;"] = "\042"
    amp["&quot;"]     = "\042"
    amp["&[\043]60;"] = "\074"
    amp["&lt;"]       = "\074"
    amp["&[\043]62;"] = "\076"
    amp["&gt;"]       = "\076"

    # Upper-case accented letters (octal 300-337; 327 is not mapped).
    amp["&Agrave;"] = "\300"
    amp["&Aacute;"] = "\301"
    amp["&Acirc;"]  = "\302"
    amp["&Atilde;"] = "\303"
    amp["&Auml;"]   = "\304"
    amp["&Aring;"]  = "\305"
    amp["&AElig;"]  = "\306"
    amp["&Ccedil;"] = "\307"
    amp["&Egrave;"] = "\310"
    amp["&Eacute;"] = "\311"
    amp["&Ecirc;"]  = "\312"
    amp["&Euml;"]   = "\313"
    amp["&Igrave;"] = "\314"
    amp["&Iacute;"] = "\315"
    amp["&Icirc;"]  = "\316"
    amp["&Iuml;"]   = "\317"
    amp["&ETH;"]    = "\320"
    amp["&Ntilde;"] = "\321"
    amp["&Ograve;"] = "\322"
    amp["&Oacute;"] = "\323"
    amp["&Ocirc;"]  = "\324"
    amp["&Otilde;"] = "\325"
    amp["&Ouml;"]   = "\326"
    amp["&Oslash;"] = "\330"
    amp["&Ugrave;"] = "\331"
    amp["&Uacute;"] = "\332"
    amp["&Ucirc;"]  = "\333"
    amp["&Uuml;"]   = "\334"
    amp["&Yacute;"] = "\335"
    amp["&THORN;"]  = "\336"
    amp["&szlig;"]  = "\337"

    # Lower-case accented letters (octal 340-377; 367 is not mapped).
    amp["&agrave;"] = "\340"
    amp["&aacute;"] = "\341"
    amp["&acirc;"]  = "\342"
    amp["&atilde;"] = "\343"
    amp["&auml;"]   = "\344"
    amp["&aring;"]  = "\345"
    amp["&aelig;"]  = "\346"
    amp["&ccedil;"] = "\347"
    amp["&egrave;"] = "\350"
    amp["&eacute;"] = "\351"
    amp["&ecirc;"]  = "\352"
    amp["&euml;"]   = "\353"
    amp["&igrave;"] = "\354"
    amp["&iacute;"] = "\355"
    amp["&icirc;"]  = "\356"
    amp["&iuml;"]   = "\357"
    amp["&eth;"]    = "\360"
    amp["&ntilde;"] = "\361"
    amp["&ograve;"] = "\362"
    amp["&oacute;"] = "\363"
    amp["&ocirc;"]  = "\364"
    amp["&otilde;"] = "\365"
    amp["&ouml;"]   = "\366"
    amp["&oslash;"] = "\370"
    amp["&ugrave;"] = "\371"
    amp["&uacute;"] = "\372"
    amp["&ucirc;"]  = "\373"
    amp["&uuml;"]   = "\374"
    amp["&yacute;"] = "\375"
    amp["&thorn;"]  = "\376"
    amp["&yuml;"]   = "\377"

    # Miscellaneous symbols.
    amp["&reg;"]       = "\256"
    amp["&copy;"]      = "\251"
    amp["&[\043]163;"] = "\243"
    # Soft hyphen; the version history in the file header lists it as added
    # in version 1.1.
    amp["&shy;"]       = "\255"
}
# Main rule: runs once per input line, strips tags, resolves entities and
# prints what is left.
#
# Variable ``state'' is one if unresolved `<', zero otherwise.  It is not
# reset at the start of a line, so a tag may continue over several lines.
#
# Variables reset for every line:
#   line     - the text of this line with the tags removed
#   errstr   - error messages collected while scanning this line
#   erra     - nonzero once the "Multiple `<'" error was issued for this line
#   errb     - nonzero once the "Whitespace after `<'" error was issued
#   currsrch - index in $0 at which the next search for `<' or `>' starts
#   txtbeg   - index in $0 at which the current run of plain text begins
{
    line = ""; errstr = ""; erra = 0; errb = 0
    currsrch = 1; txtbeg = 1

    while (match(substr($0, currsrch), /[<>]/) != 0) {
        # RSTART is relative to the substring searched, so currsrch now
        # points just past the `<' or `>' that was found.
        currsrch = (currsrch + RSTART)
        if (substr($0, (currsrch - 1), 1) == "<") {
            if (state) {
                # Already inside a tag: report it once per line.
                if (!erra) {
                    errstr = (errstr "&&^Multiple `<' without `>' ERROR!, Ignoring^&&\n")
                    erra = 1
                }
            } else if ((currsrch > length($0)) || (substr($0, currsrch, 1) ~ /^[ \t]$/)) {
                # `<' at end of line or followed by blank/tab: not a tag
                # opener.  Report once per line; the `<' stays in the text
                # because neither txtbeg nor state is changed.
                if (!errb) {
                    errstr = (errstr "&&^Whitespace after `<': Bad SGML syntax ERROR!, Ignoring^&&\n")
                    errb = 1
                }
            } else {
                # Start of a tag: keep the plain text in front of it.
                if (currsrch > (txtbeg + 1)) {
                    line = (line substr($0, txtbeg, (currsrch - (txtbeg + 1))))
                }
                state = 1
            }
        } else if (substr($0, (currsrch - 1), 1) == ">") {
            if (!state) {
                continue    #`>' without `<'
            } else {
                # End of a tag: plain text resumes right after the `>'.
                txtbeg = currsrch; state = 0
            }
        } else {
            print "Internal error, ignore"
        }
    }

    # At end of line: keep the trailing text unless still inside a tag.
    if ((!state) && (txtbeg <= length($0))) {
        line = (line substr($0, txtbeg))
    }

    # Resolve entities only when the text contains something entity-like.
    if (line ~ /&[\043]?[-0-9a-zA-Z.]*;/) {
        for (x in amp) {
            gsub(x, amp[x], line)
            # Stop early once no `&' is left to resolve.
            if (line !~ /&/) {
                break
            }
        }
        # The ampersand itself is resolved last, after the table loop, so
        # the `&' it produces is not run through the table afterwards.
        # "\\&" is a literal `&' in a gsub() replacement.
        gsub(/&([\043]38|amp);/, "\\&", line)
    }

    # Output: a line is emitted if it has text, or if it was empty in the
    # input while outside any tag.  The newline is omitted only when a tag
    # is still open, no error was collected, and the text does not end in
    # a blank or tab.
    if ((line) || ((!state) && ($0 ~ /^$/))) {
        if ((!state) || (errstr) || (line ~ /[ \t]$/)) {
            print line
        } else {
            printf "%s", line
        }
    }
    if (errstr) {
        printf "%s", errstr
    }
}
#Minor bug: &g<X>t; will translate to a `>' character!
#
# After the last input line: if ``state'' is still set, a `<' was never
# matched by a `>', so report it.
END {
    if (state) {
        print "&&^Was awaiting a `>' ERROR! at END^&&"
    }
}