updated on Thu Jan 26 16:09:46 UTC 2012
[aur-mirror.git] / djbdns-opendns / dehtml.awk
blobd6e51ba62491054b38a33f1e138eac4275f8ddb0
1 #dehtml.awk: Removes all HTML tags from file, preliminary to spell check; common
2 # ampersand "&entities;" are also resolved into single characters.
4 # Typical use:
6 # awk -f dehtml.awk infile.html > outfile.txt
8 # This program is written in the ``awk'' programming language (on Sun systems
9 # and some others, non-archaic ``awk'' is called ``nawk'', so that ``nawk''
10 # should be used instead of ``awk''). Also, a freely-redistributable ``awk''
11 # interpreter called ``gawk'', which is free of the bugs that some of the
12 # vendor-supplied ``awk''/``nawk'' programs suffer from, is available for most
13 # platforms, and as source from the FSF GNU project.
15 # This program processes all files on the command line to STDOUT; to process a
16 # number of files individually, use the iteration mechanism of your shell; for
17 # example:
# for a in *.html ; do awk -f dehtml.awk $a > otherdir/$a ; done
21 # in Unix sh, or:
23 # for %a in (*.htm) do call dehtml %a otherdir\%a
25 # in MS-DOS, where dehtml.bat is the following one-line batch file:
27 # gawk -f dehtml.awk %1 > %2
29 # Copyright H. Churchyard 1994, 1995 -- freely redistributable.
31 # Version 1.0 11/27/94 -- Included in htmlchek 3.0 release.
# Version 1.1 12/6/94 -- Fixed minor bug which could unpredictably cause a
# string such as "&amp;eacute;" to be reduced into a single character;
# added "&shy;". Included in htmlchek 3.01 release.
35 # Version 1.2 1/12/95 -- No error on `>' outside tag; minor bugfix. Included
36 # in htmlchek 4.0 release.
38 #This will test the 8-bit-cleanliness of your awk:
# Build the entity table ``amp''.  Each subscript is a regular expression that
# matches one character entity reference; each value is the single character
# that replaces it.  Values above \177 are raw ISO 8859-1 (Latin-1) bytes, so
# the awk in use must be 8-bit clean.
#
# `&amp;' / `&#38;' are deliberately NOT in this table: the main rule resolves
# them separately, after every other entity, so that text such as
# "&amp;eacute;" is not decoded twice.
BEGIN {
    # Numeric and named forms of the basic characters.  In the numeric forms
    # the `#' is spelled as the bracket expression [\043].
    add_entity("[\043]32", "\040")
    add_entity("nbsp",     "\040")
    add_entity("[\043]34", "\042")
    add_entity("quot",     "\042")
    add_entity("[\043]60", "\074")
    add_entity("lt",       "\074")
    add_entity("[\043]62", "\076")
    add_entity("gt",       "\076")

    # ISO Latin-1 upper-case letters.
    add_entity("Agrave", "\300")
    add_entity("Aacute", "\301")
    add_entity("Acirc",  "\302")
    add_entity("Atilde", "\303")
    add_entity("Auml",   "\304")
    add_entity("Aring",  "\305")
    add_entity("AElig",  "\306")
    add_entity("Ccedil", "\307")
    add_entity("Egrave", "\310")
    add_entity("Eacute", "\311")
    add_entity("Ecirc",  "\312")
    add_entity("Euml",   "\313")
    add_entity("Igrave", "\314")
    add_entity("Iacute", "\315")
    add_entity("Icirc",  "\316")
    add_entity("Iuml",   "\317")
    add_entity("ETH",    "\320")
    add_entity("Ntilde", "\321")
    add_entity("Ograve", "\322")
    add_entity("Oacute", "\323")
    add_entity("Ocirc",  "\324")
    add_entity("Otilde", "\325")
    add_entity("Ouml",   "\326")
    add_entity("Oslash", "\330")
    add_entity("Ugrave", "\331")
    add_entity("Uacute", "\332")
    add_entity("Ucirc",  "\333")
    add_entity("Uuml",   "\334")
    add_entity("Yacute", "\335")
    add_entity("THORN",  "\336")
    add_entity("szlig",  "\337")

    # ISO Latin-1 lower-case letters.
    add_entity("agrave", "\340")
    add_entity("aacute", "\341")
    add_entity("acirc",  "\342")
    add_entity("atilde", "\343")
    add_entity("auml",   "\344")
    add_entity("aring",  "\345")
    add_entity("aelig",  "\346")
    add_entity("ccedil", "\347")
    add_entity("egrave", "\350")
    add_entity("eacute", "\351")
    add_entity("ecirc",  "\352")
    add_entity("euml",   "\353")
    add_entity("igrave", "\354")
    add_entity("iacute", "\355")
    add_entity("icirc",  "\356")
    add_entity("iuml",   "\357")
    add_entity("eth",    "\360")
    add_entity("ntilde", "\361")
    add_entity("ograve", "\362")
    add_entity("oacute", "\363")
    add_entity("ocirc",  "\364")
    add_entity("otilde", "\365")
    add_entity("ouml",   "\366")
    add_entity("oslash", "\370")
    add_entity("ugrave", "\371")
    add_entity("uacute", "\372")
    add_entity("ucirc",  "\373")
    add_entity("uuml",   "\374")
    add_entity("yacute", "\375")
    add_entity("thorn",  "\376")
    add_entity("yuml",   "\377")

    # Symbols.  The soft hyphen is rendered as an ordinary hyphen.
    add_entity("reg",       "\256")
    add_entity("copy",      "\251")
    add_entity("[\043]163", "\243")
    add_entity("shy",       "-")
}

# add_entity(name, ch): register entity reference `&name;' as decoding to ch.
# The subscript is assembled from pieces so that no complete entity reference
# appears literally in this source file, where an HTML-aware tool could
# decode it and silently corrupt the table.
function add_entity(name, ch) {
    amp["&" name ";"] = ch
}
68 # Main
70 # Variable ``state'' is one if unresolved `<', zero otherwise.
# Main rule, run for every input line: copy the text that lies outside
# `<...>' tags to standard output, with character entities decoded.
#
# ``state'' persists from one line to the next, so a tag may span several
# lines.  Text collected from a line that ends inside a tag is written
# without a newline (unless it ends in whitespace or an error message
# follows), so that it joins up with the text after the tag's closing `>'.
{
    line = ""       # tag-free, entity-decoded text collected from this line
    errstr = ""     # diagnostics to emit after the text of this line
    erra = 0        # set once "Multiple `<'" has been reported for this line
    errb = 0        # set once "Whitespace after `<'" has been reported
    currsrch = 1    # position in $0 at which the next search starts
    txtbeg = 1      # position in $0 at which the current text run starts
    reclen = length($0)

    while (match(substr($0, currsrch), /[<>]/) != 0) {
        # Step just past the bracket that was found.
        currsrch = currsrch + RSTART
        bracket = substr($0, currsrch - 1, 1)

        if (bracket == "<") {
            if (state) {
                # Already inside a tag: report once per line and ignore it.
                if (!erra) {
                    errstr = (errstr "&&^Multiple `<' without `>' ERROR!, Ignoring^&&\n")
                    erra = 1
                }
            } else if ((currsrch > reclen) || (substr($0, currsrch, 1) ~ /^[ \t]$/)) {
                # `<' at end of line or followed by blank: not a tag.  It is
                # reported once per line and left in the text.
                if (!errb) {
                    errstr = (errstr "&&^Whitespace after `<': Bad SGML syntax ERROR!, Ignoring^&&\n")
                    errb = 1
                }
            } else {
                # A tag opens here: keep the text that preceded it, if any.
                if (currsrch > (txtbeg + 1))
                    line = (line decode_entities(substr($0, txtbeg, currsrch - (txtbeg + 1))))
                state = 1
            }
        } else if (state) {
            # `>' closes the open tag; text resumes right after it.
            txtbeg = currsrch
            state = 0
        }
        # Otherwise this is a `>' outside any tag: it is not an error and
        # simply remains part of the text.
    }

    # At end of line: keep whatever text follows the last tag.
    if ((!state) && (txtbeg <= reclen))
        line = (line decode_entities(substr($0, txtbeg)))

    # Print collected text; an empty input line outside a tag is preserved.
    if ((line) || ((!state) && ($0 ~ /^$/))) {
        if ((!state) || (errstr) || (line ~ /[ \t]$/))
            print line
        else
            printf "%s", line
    }
    if (errstr)
        printf "%s", errstr
}

# decode_entities(text): return text with the entity references listed in the
# ``amp'' table replaced by their characters, and then `&amp;' / `&#38;'
# replaced by `&'.  The ampersand is resolved last so that the `&' it yields
# cannot combine with following text and be decoded a second time.
#
# This is applied to each run of text between tags separately, before the
# runs are joined.  An entity name interrupted by a tag, such as `&g<X>t;',
# is therefore left as it is instead of being turned into `>'.
function decode_entities(text,    pat) {
    if (text ~ /&[\043]?[-0-9a-zA-Z.]*;/) {
        for (pat in amp) {
            gsub(pat, amp[pat], text)
            # Stop early once no ampersand is left to decode.
            if (text !~ /&/)
                break
        }
        gsub(/&([\043]38|amp);/, "\\&", text)
    }
    return text
}
# After the last input line: if a tag opened by `<' was never closed,
# ``state'' is still set, so report the dangling tag.
END {
    if (state) {
        print "&&^Was awaiting a `>' ERROR! at END^&&"
    }
}
107 ##EOF