Misc/faq2html.py

   1 #! /usr/bin/env python
   2
   3 # A somewhat-generalized FAQ-to-HTML converter (by Ka-Ping Yee, 10 Sept 96)
   4
   5 # Reads a text file given on standard input or named as first argument, and
   6 # generates HTML 2.0 on standard output.  Recognizes these constructions:
   7 #
   8 #     HTML element               pattern at the beginning of a line
   9 #
  10 #     section heading            (<number><period>)+<space>
  11 #     numbered list element      <1-2 spaces>(<number><period>)+<space>
  12 #     unnumbered list element    <0-2 spaces><hyphen or asterisk><space>
  13 #     preformatted section       <more than two spaces>
  14 #
  15 # Heading level is determined by the number of (<number><period>) segments.
  16 # Blank lines force a separation of elements; if none of the above four
  17 # types is indicated, a new paragraph begins.  A line beginning with many
  18 # spaces is interpreted as a continuation (instead of preformatted) after
  19 # a list element.  Headings are anchored; paragraphs starting with "Q." are
  20 # emphasized, and those marked with "A." get their first sentence emphasized.
  21 #
  22 # Hyperlinks are created from references to:
  23 #     URLs, explicitly marked using <URL:scheme://host...>
  24 #     other questions, of the form "question <number>(<period><number>)*"
  25 #     sections, of the form "section <number>".
  26
  27 import sys, string, regex, regsub, regex_syntax
  28 regex.set_syntax(regex_syntax.RE_SYNTAX_AWK)
  29
  30 # --------------------------------------------------------- regular expressions
  31 orditemprog = regex.compile('  ?([1-9][0-9]*\.)+ +')
  32 itemprog = regex.compile(' ? ?[-*] +')
  33 headingprog = regex.compile('([1-9][0-9]*\.)+ +')
  34 prefmtprog = regex.compile('   ')
  35 blankprog = regex.compile('^[ \t\r\n]$')
  36 questionprog = regex.compile(' *Q\. +')
  37 answerprog = regex.compile(' *A\. +')
  38 sentprog = regex.compile('(([^.:;?!(]|[.:;?!][^ \t\r\n])+[.:;?!]?)')
  39
  40 mailhdrprog = regex.compile('^(Subject|Newsgroups|Followup-To|From|Reply-To'
  41     '|Approved|Archive-Name|Version|Last-Modified): +', regex.casefold)
  42 urlprog = regex.compile('&lt;URL:([^&]+)&gt;')
  43 addrprog = regex.compile('&lt;([^>@:]+@[^&@:]+)&gt;')
  44 qrefprog = regex.compile('question +([1-9](\.[0-9]+)*)')
  45 srefprog = regex.compile('section +([1-9][0-9]*)')
  46 entityprog = regex.compile('[&<>]')
  47
  48 # ------------------------------------------------------------ global variables
  49 body = []
  50 ollev = ullev = 0
  51 element = content = secnum = version = ''
  52
  53 # ----------------------------------------------------- for making nested lists
  54 def dnol():
  55     global body, ollev
  56     ollev = ollev + 1
  57     if body[-1] == '</li>': del body[-1]
  58     body.append('<ol>')
  59
  60 def upol():
  61     global body, ollev
  62     ollev = ollev - 1
  63     body.append(ollev and '</ol></li>' or '</ol>')
  64
  65 # --------------------------------- output one element and convert its contents
  66 def spew(clearol=0, clearul=0):
  67     global content, body, ollev, ullev
  68
  69     if content:
  70         if entityprog.search(content) > -1:
  71             content = regsub.gsub('&', '&amp;', content)
  72             content = regsub.gsub('<', '&lt;', content)
  73             content = regsub.gsub('>', '&gt;', content)
  74
  75         n = questionprog.match(content)
  76         if n > 0:
  77             content = '<em>' + content[n:] + '</em>'
  78             if ollev:                       # question reference in index
  79                 fragid = regsub.gsub('^ +|\.? +$', '', secnum)
  80                 content = '<a href="#%s">%s</a>' % (fragid, content)
  81
  82         if element[0] == 'h':               # heading in the main text
  83             fragid = regsub.gsub('^ +|\.? +$', '', secnum)
  84             content = secnum + '<a name="%s">%s</a>' % (fragid, content)
  85
  86         n = answerprog.match(content)
  87         if n > 0:                           # answer paragraph
  88             content = regsub.sub(sentprog, '<strong>\\1</strong>', content[n:])
  89
  90         body.append('<' + element + '>' + content)
  91         body.append('</' + element + '>')
  92         content = ''
  93
  94     while clearol and ollev: upol()
  95     if clearul and ullev: body.append('</ul>'); ullev = 0
  96
  97 # ---------------------------------------------------------------- main program
  98 faq = len(sys.argv)>1 and sys.argv[1] and open(sys.argv[1]) or sys.stdin
  99 lines = faq.readlines()
 100
 101 for line in lines:
 102     if line[2:9] == '=======':              # <hr> will appear *before*
 103         body.append('<hr>')                 # the underlined heading
 104         continue
 105
 106     n = orditemprog.match(line)
 107     if n > 0:                               # make ordered list item
 108         spew(0, 'clear ul')
 109         secnum = line[:n]
 110         level = string.count(secnum, '.')
 111         while level > ollev: dnol()
 112         while level < ollev: upol()
 113         element, content = 'li', line[n:]
 114         continue
 115
 116     n = itemprog.match(line)
 117     if n > 0:                               # make unordered list item
 118         spew('clear ol', 0)
 119         if ullev == 0: body.append('<ul>'); ullev = 1
 120         element, content = 'li', line[n:]
 121         continue
 122
 123     n = headingprog.match(line)
 124     if n > 0:                               # make heading element
 125         spew('clear ol', 'clear ul')
 126         secnum = line[:n]
 127         sys.stderr.write(line)
 128         element, content = 'h%d' % string.count(secnum, '.'), line[n:]
 129         continue
 130
 131     n = 0
 132     if not secnum:                          # haven't hit body yet
 133         n = mailhdrprog.match(line)
 134         v = version and -1 or regex.match('Version: ', line)
 135         if v > 0 and not version: version = line[v:]
 136     if n <= 0 and element != 'li':          # not pre if after a list item
 137         n = prefmtprog.match(line)
 138     if n > 0:                               # make preformatted element
 139         if element == 'pre':
 140             content = content + line
 141         else:
 142             spew('clear ol', 'clear ul')
 143             element, content = 'pre', line
 144         continue
 145
 146     if blankprog.match(line) > 0:           # force a new element
 147         spew()
 148         element = ''
 149     elif element:                           # continue current element
 150         content = content + line
 151     else:                                   # no element; make paragraph
 152         spew('clear ol', 'clear ul')
 153         element, content = 'p', line
 154
 155 spew()                                                                          # output last element
 156
 157 body = string.joinfields(body, '')
 158 body = regsub.gsub(urlprog, '<a href="\\1">\\1</a>', body)
 159 body = regsub.gsub(addrprog, '<a href="mailto:\\1">\\1</a>', body)
 160 body = regsub.gsub(qrefprog, '<a href="#\\1">question \\1</a>', body)
 161 body = regsub.gsub(srefprog, '<a href="#\\1">section \\1</a>', body)
 162
 163 print '<!doctype html public "-//IETF//DTD HTML 2.0//EN"><html>'
 164 print '<head><title>Python Frequently-Asked Questions v' + version
 165 print "</title></head><body>(This file was generated using Ping's"
 166 print '<a href="faq2html.py">faq2html.py</a>.)'
 167 print body + '</body></html>'