2 # vi: set softtabstop=4 shiftwidth=4 tabstop=8 expandtab:
7 from HTMLParser
import HTMLParser
, HTMLParseError
10 from PlannerExceptions
import *
def __init__(self, start, end, time, html):
    """Parse a planner HTML response into a list of itinerary entries.

    The itinerary table is cut out of ``html`` by _extractItin() and fed
    to an ItineraryParser; whatever entries that parser collects are
    stored on ``self.entries``.

    NOTE(review): ``start``, ``end`` and ``time`` are not used by the
    statements in this portion of the method.
    """
    # Create the parser first, then hand it only the itinerary chunk of
    # the page rather than the whole document.
    parser = ItineraryParser()
    itin_html = self._extractItin(html)
    parser.feed(itin_html)

    self.entries = parser.entries
def anyUnparsed(self):
    """Report whether any entry was left with type TYPE_UNKNOWN.

    Returns True as soon as one entry in ``self.entries`` has
    ``type == TYPE_UNKNOWN``, and False when none does (including when
    the list is empty).
    """
    for ie in self.entries:
        if ie.type == TYPE_UNKNOWN:
            return True
    return False
def _extractItin(self, html):
    """Return the itinerary <table> markup embedded in ``html``.

    The table is located with the module-level _itin_rx pattern and
    returned as the text of its "itin" group.

    Raises ItineraryParseException when the pattern does not match.
    """
    match = _itin_rx.search(html)
    # Only a failed search is an error; raising unconditionally would
    # make the return below unreachable.
    if match is None:
        raise ItineraryParseException("Failed to extract itinerary")
    return match.group("itin")
39 # Matches the entire itinerary, pulling the table into group "itin"
40 _itin_re
= ('<!-- begin graphical itin.*?>\s*'
41 # ?s: DOTALL: . matches \n
42 '(?s)(?P<itin><table.*</table>)\s*'
43 '<!-- end graphical itin')
44 _itin_rx
= re
.compile(_itin_re
)
48 class ItineraryParser(HTMLParser
):
50 HTMLParser
.__init
__(self
)
51 self
.state
= STATE_NOTHING
56 # Most recent ItinEntry, to which a bus stop number may be added
59 def handle_starttag(self
, tag
, attrs
):
60 # The directions are contained in <td class="itinText...">.
61 if tag
== "td" and attrStarts(attrs
, "class", "itinText"):
65 self
.state
= STATE_IN_ITIN_TEXT
67 # The stop numbers are inside a block like this:
68 # <span class="itinBusStop">O-TRAIN CARLETON S.</span>
69 # <span class="itinBusStop"><a href="#" onClick="MM_openBrWindow('get.stop.timetable.oci?sptDate=2005-06-05&stop560=3062&stopLabel=CG995&stopName=O-TRAIN%2520CARLETON%2520S.','_stopWindow','scrollbars=yes,resizable=yes,width=800,height=600')">(3062)</a></span>
71 # We'll save the data in the <a> tag.
72 elif self
.state
== STATE_AFTER_ITIN_TEXT \
73 and tag
== "span" and attrStarts(attrs
, "class", "itinBusStop"):
76 self
.state
= STATE_IN_BUS_STOP
78 elif self
.state
== STATE_IN_ITIN_TEXT
and tag
== "br":
81 def handle_endtag(self
, tag
):
82 if self
.state
== STATE_IN_ITIN_TEXT
and tag
== "td":
84 self
.state
= STATE_AFTER_ITIN_TEXT
87 elif self
.state
== STATE_IN_BUS_STOP
and tag
== "span":
88 if self
._saveBusStop
():
89 self
.state
= STATE_NOTHING
91 # Usually there are two itinBusStop spans in a row, and
92 # this was probably the first one.
93 self
.state
= STATE_AFTER_ITIN_TEXT
95 def handle_data(self
, data
):
96 if self
.state
in (STATE_IN_ITIN_TEXT
, STATE_IN_BUS_STOP
):
99 def _saveTextEntry(self
):
100 ie
= self
._buildItinEntry
(self
.data
.strip())
101 self
.entries
.append(ie
)
def _saveBusStop(self):
    """Store the stop number found in ``self.data`` on the latest entry.

    Searches ``self.data`` with _bus_stop_rx.  On a match, the "stopnum"
    group is assigned to ``self.lastIE.busStop``.

    Returns True when a stop number was found and stored, False
    otherwise; handle_endtag() branches on this result.
    """
    match = _bus_stop_rx.search(self.data)
    if match is None:
        # No "(NNNN)" in this span; leave the entry untouched.
        return False
    self.lastIE.busStop = match.group("stopnum")
    return True
110 def _buildItinEntry(self
, text
):
113 # Match each regexp in turn.
115 (TYPE_DEPART_TIME
, _depart_rx
),
116 (TYPE_WAIT
, _wait_rx
),
117 (TYPE_WALK_TO_STOP
, _walk_to_stop_rx
),
118 (TYPE_WALK_TO_TRANSFER
, _walk_transfer_stop_rx
),
119 (TYPE_TAKE_BUS
, _take_bus_rx
),
120 (TYPE_WALK_TO_DEST
, _walk_to_dest_rx
),
122 for entry
in matchTypes
:
123 match
= entry
[1].search(text
)
127 # Extract fields, if any.
128 groups
= match
.groupdict()
129 for grp
in groups
.iteritems():
130 setattr(ie
, grp
[0], grp
[1])
# ItineraryParser is an FSM.  The states below are stored in its
# ``state`` attribute by handle_starttag() / handle_endtag().

# Inside a <td> whose class starts with "itinText" (the directions text).
STATE_IN_ITIN_TEXT = 1
# The itinText <td> has closed; an itinBusStop <span> may follow.
STATE_AFTER_ITIN_TEXT = 2
# Inside a <span> whose class starts with "itinBusStop".
STATE_IN_BUS_STOP = 3
def attrStarts(attrs, name, val):
    """Tell whether attribute ``name`` has a value starting with ``val``.

    attrs: list of (name, value) pairs as passed to
           HTMLParser.handle_starttag().
    name:  attribute name to look for.
    val:   prefix the attribute's value must start with.

    Returns True if a matching attribute is found, False otherwise.
    The comparison is case insensitive.
    """
    # Lowercase both needles so callers may pass mixed-case text such as
    # "itinText" and still match the lowercased attribute value.
    name = name.lower()
    val = val.lower()

    for attr_name, attr_val in attrs:
        # The attribute name is already lowercased by HTMLParser;
        # make the value lowercase, too.  An attribute written without
        # a value arrives as None and can never match a prefix.
        if attr_name == name and attr_val is not None \
                and attr_val.lower().startswith(val):
            return True
    return False
156 _bus_stop_re
= '\((?P<stopnum>\d{4})\)'
157 _bus_stop_rx
= re
.compile(_bus_stop_re
)
160 # Expressions we'll use to narrow an ItinEntry down to a specific type.
161 # We'll also pick out data fields where possible.
162 _ie_time_re
= '[\d: APM]+'
164 _depart_re
= '^Depart at (?P<startTime>' + _ie_time_re
+ ')$'
165 _depart_rx
= re
.compile(_depart_re
)
167 _walk_to_stop_re
= ('^At (?P<startTime>' + _ie_time_re
168 + '),\s*walk to (?:stop|station)\s*(?P<destination>.*?)\s*'
169 '\((?P<duration>[\d]+)\s*min')
170 _walk_to_stop_rx
= re
.compile(_walk_to_stop_re
)
172 # If the trip includes a transfer through a nearby stop, you'll get this.
173 _walk_transfer_stop_re
= ('(?i)^Walk to (?:stop|station)\s*'
174 '(?P<destination>.*?)\s*'
176 _walk_transfer_stop_rx
= re
.compile(_walk_transfer_stop_re
)
# Flag 's' (DOTALL) is set on this one so we can skip over newlines in
# the description after "get off at stop <destination>".  This description
# could be of the form "1 station(s) further" or
# "street PRESTON following street GLADSTONE.\nLast intersections: ... AVE."
# Flag 'i' makes the match case insensitive.  The pieces are raw strings
# so the regex escapes (\s, \.) reach the re module unchanged.
_take_bus_re = (r'(?is)^At (?P<startTime>' + _ie_time_re
                + r'),\s*take (?:train|bus)\s+route\s+(?P<route>.*?)\s*direction'
                r'\s*(?P<direction>.*?)\s*and get off at (?:stop|station)\s*'
                r'(?P<destination>.*?)(?:\s*, .*)?\.\s*Arrive at'
                r'\s*(?P<endTime>' + _ie_time_re + r')\.')
_take_bus_rx = re.compile(_take_bus_re)
189 _wait_re
= ('(?i)^Wait (?P<duration>\d+) min')
190 _wait_rx
= re
.compile(_wait_re
)
# If Chris asks to go from 916 to 918 meadowlands, it returns one step that
# begins with "At 1:45 PM, ...".  Otherwise, the "At ... " clause isn't there,
# so the leading time is optional in this pattern.
# The pieces are raw strings so the regex escapes (\s, \., \(, \d) are not
# treated as Python string escapes.
_walk_to_dest_re = (r'(?i)^(?:At (?P<startTime>' + _ie_time_re + r'),\s*)?'
                    r'Walk to (?P<destination>.*)\.\s+Arrive at'
                    r'\s*(?P<endTime>' + _ie_time_re + r')\s+'
                    r'\((?P<duration>\d+) min')
_walk_to_dest_rx = re.compile(_walk_to_dest_re)
201 """One step in the plan.
205 text: Original text from the planner.
206 duration: Duration of the step in minutes.
207 startTime: Start time as a string.
208 endTime: End time as a string.
209 route: Bus or train route.
210 direction: Bus or train direction.
211 destination: A string. Usually a bus stop at the end of a step.
212 busStop: Bus stop number associated with the destination."""
214 def __init__(self
, text
):
217 self
.type = TYPE_UNKNOWN
219 self
.startTime
= None
222 self
.direction
= None
223 self
.destination
= None
227 return "<T%d %s (%s)>" % (self
.type, self
.text
, self
.busStop
)
229 return "<T%d %s>" % (self
.type, self
.text
)
232 return self
.__str
__()
# Entry type paired with _walk_to_stop_rx in _buildItinEntry()
# ("At <time>, walk to stop|station ...").
TYPE_WALK_TO_STOP = 2
# Entry type paired with _walk_to_dest_rx ("Walk to <destination>. Arrive at ...").
TYPE_WALK_TO_DEST = 5
# Entry type paired with _walk_transfer_stop_rx ("Walk to stop|station ...",
# used when the trip transfers through a nearby stop).
TYPE_WALK_TO_TRANSFER = 6