share/misc/nanpa.sed

   1 # $NetBSD: nanpa.sed,v 1.2 2006/12/25 18:39:48 wiz Exp $
   2 #
   3 # Parse HTML tables output by
   4 #   http://docs.nanpa.com/cgi-bin/npa_reports/nanpa
   5 # Specifically, for each html table row (TR),
   6 # print the <TD> elements separated by colons.
   7 #
   8 # This could break on HTML comments.
   9 #
  10 :top
  11 #                               Strip ^Ms
  12 s/\r//g
  13 #                               Join all lines with unterminated HTML tags
  14 /<[^>]*$/{
  15         N
  16         b top
  17 }
  18 #                               Replace all </TR> with EOL tag
  19 s;</[Tt][Rr]>;$;g
  20 #                               Join lines with only <TR>.
  21 /<[Tt][Rr][^>]*>$/{
  22         N
  23         s/\n//g
  24         b top
  25 }
  26 #                               Also, join all lines starting with <TR>.
  27 /<[TtRr][^>]*>[^$]*$/{
  28         N
  29         s/\n//g
  30         b top
  31 }
  32 #                               Remove EOL markers
  33 s/\$$//
  34 #                               Remove lines not starting with <TR>
  35 /<[Tt][Rr][^>]*>/!d
  36 #                               Replace all <TD> with colon
  37 s/[     ]*<TD[^>]*> */:/g
  38 #                               Strip all HTML tags
  39 s/<[^>]*>//g
  40 #                               Handle HTML characters
  41 s/&nbsp;/ /g
  42 #                               Compress spaces/tabs
  43 s/[     ][      ]*/ /g
  44 #                               Strip leading colons
  45 s/^://
  46 #                               Strip leading/trailing whitespace
  47 s/^ //
  48 s/ $//