2 # Copyright (c) 2011 The Chromium Authors. All rights reserved.
3 # Use of this source code is governed by a BSD-style license that can be
4 # found in the LICENSE file.
6 """Does scraping for Firefox 2.0."""
import time

from drivers import keyboard
from drivers import mouse
from drivers import windowing
# Location of the Firefox executable used when the caller supplies no path.
DEFAULT_PATH = r"c:\program files\mozilla firefox\firefox.exe"
21 # TODO(jhaas): the Firefox scraper is a bit rickety at the moment. Known
22 # issues: 1) won't work if the default profile puts toolbars in different
23 # locations, 2) uses sleep() statements rather than more robust checks,
24 # 3) fails badly if an existing Firefox window is open when the scrape
25 # is invoked. This needs to be fortified at some point.
28 """Invoke the Firefox browser and return the process and window.
31 path: full path to browser
34 A tuple of (process handle, render pane)
36 if not path
: path
= DEFAULT_PATH
39 (proc
, wnd
) = windowing
.InvokeAndWait(path
)
41 # Get the content pane
42 render_pane
= windowing
.FindChildWindow(
44 "MozillaWindowClass/MozillaWindowClass/MozillaWindowClass")
46 return (proc
, wnd
, render_pane
)
def InvokeBrowser(path):
    """Invoke the Firefox browser.

    Args:
      path: full path to browser

    Returns:
      A tuple of (main window, process handle, render pane). The process
      handle is None when an already-open window was reused, because no
      process was started by this call.
    """
    # Reuse an existing instance of the browser if we can find one. This
    # may not work correctly, especially if the window is behind other windows.
    wnds = windowing.FindChildWindows(0, "MozillaUIWindowClass")
    if len(wnds):
        wnd = wnds[0]
        proc = None
    else:
        (proc, wnd) = windowing.InvokeAndWait(path)

    # Get the content pane.
    # NOTE(review): the parent-window argument is restored by analogy with
    # FindChildWindows(0, ...) above -- confirm against drivers.windowing.
    render_pane = windowing.FindChildWindow(
        wnd,
        "MozillaWindowClass/MozillaWindowClass/MozillaWindowClass")

    return (wnd, proc, render_pane)
def Scrape(urls, outdir, size, pos, timeout=20, **kwargs):
    """Invoke a browser, send it to a series of URLs, and save its output.

    Args:
      urls: list of URLs to scrape (a single URL string is also accepted)
      outdir: directory to place output
      size: size of browser window to use
      pos: position of browser window
      timeout: amount of time to wait for page to load
      kwargs: miscellaneous keyword args. Recognised keys:
        path: browser executable to use instead of DEFAULT_PATH
        filename: output file name, or a callable mapping a URL to one

    Returns:
      None if success, else an error string
    """
    path = kwargs.get("path") or DEFAULT_PATH

    (wnd, _proc, render_pane) = InvokeBrowser(path)

    # Resize and reposition the frame
    windowing.MoveAndSizeWindow(wnd, pos, size, render_pane)

    # Firefox is a bit of a pain: it doesn't use standard edit controls,
    # and it doesn't display a throbber when there's no tab. Let's make
    # sure there's at least one tab, then select the first one
    mouse.ClickInWindow(wnd)
    keyboard.TypeString("[t]", True)
    mouse.ClickInWindow(wnd, (30, 115))

    # Accept a bare URL string as a one-element list.
    if isinstance(urls, types.StringTypes):
        urls = [urls]

    timedout = False

    # Visit each URL we're given
    for url in urls:
        # Use keyboard shortcuts
        keyboard.TypeString("{d}", True)
        keyboard.TypeString(url)
        keyboard.TypeString("\n")

        # Wait for the page to finish loading; a negative result is
        # treated as a timeout.
        load_time = windowing.WaitForThrobber(
            wnd, (10, 96, 26, 112), timeout)
        timedout = load_time < 0
        if timedout:
            # Stop at the first page that fails to load so the failure
            # is reported rather than overwritten by a later URL.
            break

        image = windowing.ScrapeWindow(render_pane)

        # An explicit "filename" kwarg (value or callable) wins over the
        # name derived from the URL.
        if "filename" in kwargs:
            if callable(kwargs["filename"]):
                filename = kwargs["filename"](url)
            else:
                filename = kwargs["filename"]
        else:
            filename = windowing.URLtoFilename(url, outdir, ".bmp")

        # NOTE(review): assumes ScrapeWindow returns an object with a
        # save(filename) method -- confirm against drivers.windowing.
        image.save(filename)

    # Close all the tabs, cheesily
    mouse.ClickInWindow(wnd)

    # NOTE(review): no delay between keystrokes is visible here although
    # the file-level TODO mentions sleep()-based waits -- confirm.
    while len(windowing.FindChildWindows(0, "MozillaUIWindowClass")):
        keyboard.TypeString("[w]", True)

    if timedout:
        return "timeout"
    return None
def _CloseFirefoxWindows(max_attempts=5, delay=1):
    """Type the "[w]" shortcut until no Firefox UI window is left.

    Args:
      max_attempts: how many times to send the shortcut before giving up
      delay: seconds to wait after each attempt so the window can close

    Returns:
      True if no "MozillaUIWindowClass" window remains, False otherwise.
    """
    # NOTE(review): the attempt limit and delay are reconstructed values;
    # a bound is needed so a hung browser can be detected by the caller.
    attempts = 0
    while (len(windowing.FindChildWindows(0, "MozillaUIWindowClass"))
           and attempts < max_attempts):
        keyboard.TypeString("[w]", True)
        time.sleep(delay)
        attempts += 1
    return not len(windowing.FindChildWindows(0, "MozillaUIWindowClass"))


def Time(urls, size, timeout, **kwargs):
    """Measure how long it takes to load each of a series of URLs.

    Args:
      urls: list of URLs to time (a single URL string is also accepted)
      size: size of browser window to use
      timeout: amount of time to wait for page to load
      kwargs: miscellaneous keyword args. Recognised keys:
        path: browser executable to use instead of DEFAULT_PATH

    Returns:
      A list of tuples (url, time). "time" can be "crashed" or "timeout"
    """
    path = kwargs.get("path") or DEFAULT_PATH

    # Accept a bare URL string as a one-element list.
    if isinstance(urls, types.StringTypes):
        urls = [urls]

    ret = []
    wnd = None  # None means no usable browser window is currently open.
    proc = None
    render_pane = None

    # Visit each URL we're given
    for url in urls:
        try:
            # Invoke the browser if necessary
            if wnd is None:
                (wnd, proc, render_pane) = InvokeBrowser(path)

                # Resize and reposition the frame
                windowing.MoveAndSizeWindow(wnd, (0, 0), size, render_pane)

                # Firefox is a bit of a pain: it doesn't use standard edit
                # controls, and it doesn't display a throbber when there's
                # no tab. Let's make sure there's at least one tab, then
                # select the first one
                mouse.ClickInWindow(wnd)
                keyboard.TypeString("[t]", True)
                mouse.ClickInWindow(wnd, (30, 115))

            # Use keyboard shortcuts
            keyboard.TypeString("{d}", True)
            keyboard.TypeString(url)
            keyboard.TypeString("\n")

            # Wait for the page to finish loading; a negative result is
            # treated as a timeout.
            load_time = windowing.WaitForThrobber(
                wnd, (10, 96, 26, 112), timeout)
            timedout = load_time < 0

            if timedout:
                load_time = "timeout"

                # Try to close the browser; if this fails it's probably
                # a crash
                mouse.ClickInWindow(wnd)
                if not _CloseFirefoxWindows():
                    windowing.EndProcess(proc)
                    load_time = "crashed"

                # Force a fresh browser for the next URL.
                wnd = None
        except pywintypes.error:
            # A Win32 failure while driving the window is reported as a
            # crash; the window handle is no longer trusted.
            wnd = None
            load_time = "crashed"

        ret.append((url, load_time))

    # Leave no browser window behind if the last page loaded normally.
    if wnd is not None:
        _CloseFirefoxWindows()

    return ret
def main():
    """Run a small scrape as a smoke test when executed as a script."""
    # We're being invoked rather than imported, so run some tests
    path = r"c:\sitecompare\scrapes\Firefox\2.0.0.6"
    windowing.PreparePath(path)

    # Scrape three sites and save the results
    Scrape(
        ["http://www.microsoft.com", "http://www.google.com",
         "http://www.sun.com"],
        path, (1024, 768), (0, 0))


if __name__ == "__main__":
    main()