flinkspkg/lynxDump.py

   1 # Part of flinks
   2 # (C) Martin Bays 2008
   3 # Released under the terms of the GPLv3
   4
   5 import sys, os
   6 from string import *
   7
   8 import re
   9
  10 import codecs
  11
  12 from constants import USER_AGENT
  13
  14 def lynxDump(url, lynxArgs=''):
  15     _, lynxStdout, lynxErrout = os.popen3('lynx -dump -useragent="%s via lynx" %s "%s"'
  16             % (USER_AGENT, lynxArgs, url))
  17
  18     # TODO: work out the encoding somehow? For now we assume it's latin1...
  19     lynxDecoded = codecs.EncodedFile(lynxStdout, 'utf8', 'latin1', 'replace')
  20
  21     dumped = ''
  22     refdumped = ''
  23     linkUrls = []
  24     readingRefs = False
  25     for line in lynxDecoded:
  26         if line == 'References\n':
  27             if readingRefs:
  28                 # The previous matched 'References' was part of the
  29                 # document...
  30                 dumped += refdumped
  31                 refdumped = ''
  32                 linkUrls = []
  33             readingRefs = True
  34
  35         if readingRefs:
  36             m = re.match(r'\s*\d+\. (.*)\n', line)
  37             if m:
  38                 linkUrls += [m.groups()[0]]
  39             refdumped += line
  40         else:
  41             dumped += line
  42
  43     lynxDecoded.close()
  44     lynxErr = lynxErrout.read()
  45     return dumped, linkUrls, lynxErr