rework default shortcuts
[flinks.git] / flinkspkg / lynxDump.py
blobc08bd4097c831cd4c3e6c26ea81a6b51f3793648
1 # Part of flinks
2 # (C) Martin Bays 2008
3 # Released under the terms of the GPLv3
5 import sys, os
6 from string import *
8 import re
10 import codecs
12 from constants import USER_AGENT
def lynxDump(url, lynxArgs=''):
    """Run ``lynx -dump`` on `url` and separate the page text from its links.

    Parameters:
        url: the address handed to lynx as its final argument.
        lynxArgs: extra lynx command-line options as a single string; it is
            split into words with shell-like quoting rules (shlex.split).

    Returns a tuple ``(dumped, linkUrls, lynxErr)``:
        dumped: the dump text up to, but not including, the last
            "References" section, as a UTF-8 encoded byte string.
        linkUrls: the targets of the numbered entries of that last
            "References" section, in order, as UTF-8 encoded byte strings.
        lynxErr: whatever lynx wrote to stderr, undecoded.

    Raises OSError if the lynx executable cannot be started.
    """
    # Function-scope imports keep this block self-contained.
    import shlex
    import subprocess

    # Build an argument vector rather than a shell command line: the url
    # usually comes from a fetched page, so it must never be interpreted by
    # a shell.
    # NOTE(review): a url beginning with '-' would still be read by lynx as
    # an option; callers are expected to pass absolute URLs.
    command = ['lynx', '-dump', '-useragent=%s via lynx' % USER_AGENT]
    if lynxArgs:
        command += shlex.split(lynxArgs)
    command.append(url)

    proc = subprocess.Popen(command,
                            stdin=subprocess.PIPE,
                            stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE)
    # communicate() closes stdin and drains stdout and stderr together, so a
    # chatty stderr cannot stall lynx while we are still reading stdout.
    rawOut, lynxErr = proc.communicate()

    # TODO: work out the encoding somehow? For now we assume it's latin1...
    decoded = rawOut.decode('latin1', 'replace')

    refPattern = re.compile(br'\s*\d+\. (.*)\n')
    bodyLines = []
    refLines = []
    linkUrls = []
    readingRefs = False
    # Split while the data is still text, then hand each line on as UTF-8.
    for textLine in decoded.splitlines(True):
        line = textLine.encode('utf8')
        if line == b'References\n':
            if readingRefs:
                # The previous matched 'References' was part of the
                # document, so what we collected after it is body text.
                bodyLines.extend(refLines)
                refLines = []
                linkUrls = []
            readingRefs = True

        if readingRefs:
            m = refPattern.match(line)
            if m:
                linkUrls.append(m.group(1))
            refLines.append(line)
        else:
            bodyLines.append(line)

    # The lines of the final references section are deliberately left out of
    # the returned text; only the link targets extracted from it are kept.
    return b''.join(bodyLines), linkUrls, lynxErr