Merge pull request #2309 from mitza-oci/warnings
[ACE_TAO.git] / ACE / examples / Web_Crawler / URL_Visitor.cpp
blob8f6ed5af3e0f52092e8ca620751a84746bf61753
1 #include "ace/OS_NS_string.h"
2 #include "ace/Truncate.h"
3 #include "URL_Visitor.h"
4 #include "Command_Processor.h"
7 URL_Processing_Strategy::URL_Processing_Strategy (URL &url,
8 URL_Iterator &iterator)
9 : url_ (url),
10 iterator_ (iterator)
14 URL_Processing_Strategy::~URL_Processing_Strategy ()
18 int
19 URL_Processing_Strategy::destroy ()
21 // Commit suicide.
22 delete this;
23 return 0;
26 URL_Download_Strategy::URL_Download_Strategy (URL &url,
27 URL_Iterator &iterator)
28 : URL_Processing_Strategy (url, iterator)
32 int
33 URL_Download_Strategy::execute ()
35 ACE_CString buffer;
37 // Extract all the contents of the Stream and print them to the
38 // file.
39 while (this->iterator_.next (buffer) != 0)
40 ACE_DEBUG ((LM_DEBUG,
41 "%s",
42 buffer.c_str ()));
44 return 0;
47 HTTP_Header_Processing_Strategy::HTTP_Header_Processing_Strategy (URL &url,
48 URL_Iterator &iterator)
49 : URL_Processing_Strategy (url, iterator)
53 int
54 HTTP_Header_Processing_Strategy::execute ()
56 // Set the get() position.Necessary since later a peek is done.
57 if (this->url_.stream ().get_char () == 0)
58 ACE_ERROR_RETURN ((LM_ERROR,
59 "%p\n","Header Not Found"),
60 -1);
61 char line_buf[BUFSIZ + 1];
62 ACE_CString line (line_buf);
63 // Get the lines in the header iteratively and check for status info.
64 int result = 1, i = 0;
65 for (i = 0, result = this->iterator_.next (line);
66 result > 0;
67 ++i, result = this->iterator_.next (line))
69 if (i == 0)
71 // Assuming that the status-no is a space away.
72 int status_index =
73 ACE_Utils::truncate_cast<int> (line.find ("HTTP", 0));
74 ACE_CString status = line.substring (status_index + 9, //HTTP/1.1 200
75 3);
77 URL_Status *url_status = 0;
78 ACE_NEW_RETURN (url_status,
79 URL_Status,
80 0);
81 Auto_Destroyer<URL_Status> url_status_ptr (url_status);
82 url_status_ptr->status (ACE_OS::atoi (status.c_str ()));
83 this->url_.reply_status (**url_status_ptr);
84 // Invalid url.
85 if (url_status_ptr->status () != 200)
86 return -1;
88 else
90 if (line.find ("text/html") != ACE_CString::npos)
92 ACE_CString url_content_type("text/html");
93 this->url_.content_type (url_content_type);
97 return 0;
100 HTML_Body_Validation_Strategy::HTML_Body_Validation_Strategy (URL &url,
101 URL_Iterator &iterator,
102 URL_Validation_Visitor &context)
103 : URL_Processing_Strategy (url, iterator),
104 visitor_context_ (context)
109 HTML_Body_Validation_Strategy::execute ()
111 char host_name_buf[BUFSIZ + 1];
112 ACE_CString host_name (host_name_buf);
113 host_name.set (url_.url_addr ().get_host_name (),1);
115 // All to facilitate relative paths
116 char temp[BUFSIZ + 1];
117 ACE_CString prev_location (temp);
119 prev_location.set (ACE_TEXT_ALWAYS_CHAR (this->url_.url_addr ().get_path_name ()),
120 ACE_OS::strlen (this->url_.url_addr ().get_path_name ()),
122 int index =
123 ACE_Utils::truncate_cast<int> (
124 prev_location.rfind ('/', prev_location.length ()));
125 ACE_CString str = prev_location.substring (0, index + 1);
126 prev_location.set (str.c_str (), 1);
128 // Note: prev_location always ends with '/'
129 if (prev_location[0] != '/')
130 prev_location = "/" + prev_location;
132 // Build the url portion which can be attached to teh relative paths.
133 prev_location = host_name + prev_location;
135 char url_string[BUFSIZ + 1];
136 ACE_CString url (url_string);
138 while (this->iterator_.next (url) > 0)
140 // Check for relative urls.Strip out "http://" if its there.
141 if (url.find ("http") == url.npos)
143 if (url[0] == '.' && url[1] == '.')
145 url.set (&url[3], 1);
146 int i =
147 ACE_Utils::truncate_cast<int> (
148 prev_location.rfind ('/', prev_location.length () - 1));
149 prev_location = prev_location.substring (0, i+1);
151 if (url[0] == '.' && url[1] == '/')
152 url.set (&url[2], 1);
154 url = prev_location + url;
156 else
157 url.set (&url[7], 1);
158 // Double slash at the end works!e.g www.cs.wustl.edu/~kirthika//
159 if (url.find (".html") == url.npos)
160 url = url + "/";
162 // Create the new URL address.
163 ACE_URL_Addr *url_addr;
164 ACE_NEW_RETURN (url_addr,
165 ACE_URL_Addr,
167 Auto_Destroyer<ACE_URL_Addr> url_addr_ptr (url_addr);
168 if (url_addr_ptr->string_to_addr (ACE_TEXT_CHAR_TO_TCHAR (url.c_str ())) == 0)
170 HTTP_URL *http_url;
171 ACE_NEW_RETURN (http_url,
172 HTTP_URL (**url_addr_ptr,
173 dynamic_cast<HTTP_URL *> (&this->url_)),
175 URL_Command *url_command;
176 ACE_NEW_RETURN (url_command,
177 URL_Command (http_url),
180 OPTIONS::instance ()->command_processor ()->insert (url_command);
183 return 0;
186 URL_Iterator *
187 URL_Validation_Visitation_Strategy_Factory::make_header_iterator ()
189 URL_Iterator *i;
190 ACE_NEW_RETURN (i,
191 HTTP_Header_Iterator (*this->url_),
193 return i;
196 URL_Iterator *
197 URL_Validation_Visitation_Strategy_Factory::make_body_iterator ()
199 URL_Iterator *i;
200 ACE_NEW_RETURN (i,
201 HTML_Body_Iterator (*this->url_),
203 return i;
206 URL_Processing_Strategy *
207 URL_Validation_Visitation_Strategy_Factory::make_header_strategy (URL_Iterator &iterator)
209 URL_Processing_Strategy *ps;
210 ACE_NEW_RETURN (ps,
211 HTTP_Header_Processing_Strategy (*this->url_,
212 iterator),
214 return ps;
217 URL_Processing_Strategy *
218 URL_Validation_Visitation_Strategy_Factory::make_body_strategy (URL_Iterator &iterator)
220 URL_Processing_Strategy *ps;
221 ACE_NEW_RETURN (ps,
222 HTML_Body_Validation_Strategy (*this->url_,
223 iterator,
224 this->visitor_context_),
226 return ps;
230 URL_Validation_Visitation_Strategy_Factory::destroy ()
232 // Commit suicide.
233 delete this;
234 return 0;
237 URL_Visitor::~URL_Visitor ()
241 URL_Validation_Visitor::URL_Validation_Visitor ()
243 ACE_NEW (this->caching_connect_strategy_,
244 CACHED_CONNECT_STRATEGY (this->caching_strategy_));
245 ACE_NEW (this->strat_connector_,
246 STRATEGY_CONNECTOR(0,
247 &creation_strategy_,
248 caching_connect_strategy_,
249 &activation_strategy_));
250 if (strat_connector_ == 0)
251 ACE_ERROR ((LM_ERROR,
252 "%p %s\n"
253 "strategy connector creation failed"));
257 URL_Validation_Visitor::~URL_Validation_Visitor ()
259 this->strat_connector_ = 0;
260 if (this->caching_connect_strategy_ != 0)
261 delete this->caching_connect_strategy_;
264 URL_Validation_Visitor::URL_CACHE &
265 URL_Validation_Visitor::url_cache ()
267 return this->url_cache_;
271 URL_Validation_Visitor::in_cache (const ACE_URL_Addr &url_addr)
273 URL_Status reply_status (URL_Status::STATUS_CODE (1));
275 if (this->url_cache_.find (url_addr, reply_status) == 0)
277 ACE_DEBUG ((LM_DEBUG,
278 "status %d for URL %s (cached)\n",
279 reply_status.status (),
280 url_addr.addr_to_string (0)));
282 // Invalid status.
283 if (reply_status.status () != 200)
284 return -1;
286 return 1;
288 else
289 return 0;
292 URL_Visitation_Strategy_Factory *
293 URL_Validation_Visitor::make_visitation_strategy_factory (URL &url)
295 // Since this is HTTP 1.1 we'll need to establish a connection
296 // only once. Trying for relative paths.
298 if (url.stream ().open (this->strat_connector_,
299 url.url_addr ()) == -1)
300 return 0;
302 // See if we can get connected and send the GET request via the
303 // <HTTP_URL>.
304 int result = url.send_request ();
305 if (result == -1)
307 ACE_ERROR ((LM_ERROR,
308 "%p\n",
309 "send_request"));
310 if (this->url_cache_.bind (url.url_addr (),
311 URL_Status (URL_Status::STATUS_SERVICE_UNAVAILABLE)) == -1)
312 ACE_ERROR ((LM_ERROR,
313 "%p\n",
314 "bind"));
315 return 0;
317 // @@ Here's where we could check to see if the <url> was HTTP or
318 // FTP, etc. But for now we'll just assume that everything is an
319 // HTTP URL.
320 else
322 URL_Visitation_Strategy_Factory *vs;
323 ACE_NEW_RETURN (vs,
324 URL_Validation_Visitation_Strategy_Factory (&url,
325 *this),
327 return vs;
332 URL_Validation_Visitor::destroy ()
334 delete this->strat_connector_;
335 // Commit suicide.
336 delete this;
337 return 0;
341 URL_Validation_Visitor::visit (HTTP_URL &http_url)
343 int result = this->in_cache (http_url.url_addr ());
344 if (result == 0)
346 Auto_Destroyer <URL_Visitation_Strategy_Factory> vs (this->make_visitation_strategy_factory (http_url));
348 if (*vs == 0)
349 ACE_ERROR_RETURN ((LM_ERROR,
350 "%p\n",
351 "make_visitation_strategy_factory"),
352 -1);
354 Auto_Destroyer <URL_Iterator> ihs (vs->make_header_iterator ());
355 if (*ihs == 0)
356 ACE_ERROR_RETURN ((LM_ERROR,
357 "%p\n",
358 "make_header_iterator"),
359 -1);
360 Auto_Destroyer <URL_Processing_Strategy> phs (vs->make_header_strategy (**ihs));
361 if (*phs == 0)
362 ACE_ERROR_RETURN ((LM_ERROR,
363 "%p\n",
364 "make_header_strategy"),
365 -1);
366 int phs_result = phs->execute ();
367 if (phs_result == -1)
368 ACE_DEBUG ((LM_DEBUG,
369 "Invalid "));
371 ACE_DEBUG ((LM_DEBUG,
372 "URL with status %d %s\n",
373 http_url.reply_status ().status (),
374 http_url.url_addr().addr_to_string (0)));
376 // Store the http url in the cache.
377 if (this->url_cache ().bind (http_url.url_addr (),
378 http_url.reply_status ()) != 0)
379 ACE_ERROR_RETURN ((LM_ERROR,
380 "%p\n","url_cache.bind"),
381 -1);
383 // Since it is invalid dont go further.
384 if (phs_result == -1)
385 return 0;
387 // Get back if the recurse option isnt set.
388 if (OPTIONS::instance ()->recurse () != 1)
389 return 0;
391 Auto_Destroyer <URL_Iterator> is (vs->make_body_iterator ());
392 if (*is == 0)
393 ACE_ERROR_RETURN ((LM_ERROR,
394 "%p\n",
395 "make_body_iterator"),
396 -1);
398 Auto_Destroyer <URL_Processing_Strategy> ps (vs->make_body_strategy (**is));
399 if (*ps == 0)
400 ACE_ERROR_RETURN ((LM_ERROR,
401 "%p\n",
402 "make_body_strategy"),
403 -1);
405 if (ps->execute () == -1)
406 ACE_ERROR_RETURN ((LM_ERROR,
407 "%p\n",
408 "body execute"),
409 -1);
411 return 0;
415 URL_Download_Visitation_Strategy_Factory::destroy ()
417 // Commit suicide.
418 delete this;
419 return 0;
422 URL_Iterator *
423 URL_Download_Visitation_Strategy_Factory::make_header_iterator ()
425 return 0;
428 URL_Iterator *
429 URL_Download_Visitation_Strategy_Factory::make_body_iterator ()
431 URL_Iterator *i;
432 ACE_NEW_RETURN (i,
433 URL_Download_Iterator (*this->url_),
435 return i;
438 URL_Processing_Strategy *
439 URL_Download_Visitation_Strategy_Factory::make_header_strategy (URL_Iterator &iterator)
441 // You fill in here.
442 ACE_UNUSED_ARG (iterator);
444 return 0;
447 URL_Processing_Strategy *
448 URL_Download_Visitation_Strategy_Factory::make_body_strategy (URL_Iterator &iterator)
450 URL_Processing_Strategy *ps;
451 ACE_NEW_RETURN (ps,
452 URL_Download_Strategy (*this->url_,
453 iterator),
455 return ps;
458 URL_Visitation_Strategy_Factory::URL_Visitation_Strategy_Factory (URL *url)
459 : url_ (url)
463 URL_Visitation_Strategy_Factory::~URL_Visitation_Strategy_Factory ()
467 URL_Download_Visitation_Strategy_Factory::URL_Download_Visitation_Strategy_Factory (URL *url)
468 : URL_Visitation_Strategy_Factory (url)
472 URL_Validation_Visitation_Strategy_Factory::URL_Validation_Visitation_Strategy_Factory (URL *url,
473 URL_Validation_Visitor &visitor_context)
474 : URL_Visitation_Strategy_Factory (url),
475 visitor_context_ (visitor_context)
479 URL_Visitation_Strategy_Factory *
480 URL_Download_Visitor::make_visitation_strategy_factory (URL &url)
482 // See if we can get connected and send the GET request via the
483 // <HTTP_URL>.
484 while (1)
486 int retval = url.send_request ();
487 if (retval != -1)
488 break;
490 // @@ Here's where we could check to see if the <url> was HTTP or
491 // FTP, etc. But for now we'll just assume that everything is an
492 // HTTP URL.
493 URL_Visitation_Strategy_Factory *vs;
494 ACE_NEW_RETURN (vs,
495 URL_Download_Visitation_Strategy_Factory (&url),
497 return vs;
501 URL_Download_Visitor::destroy ()
503 // Commit suicide.
504 delete this;
505 return 0;
509 URL_Download_Visitor::visit (HTTP_URL &http_url)
511 Auto_Destroyer <URL_Visitation_Strategy_Factory> vs (this->make_visitation_strategy_factory (http_url));
513 if (*vs == 0)
514 ACE_ERROR_RETURN ((LM_ERROR,
515 "%p\n",
516 "make_visitation_strategy_factory"),
517 -1);
519 Auto_Destroyer <URL_Iterator> is (vs->make_body_iterator ());
520 if (*is == 0)
521 ACE_ERROR_RETURN ((LM_ERROR,
522 "%p\n",
523 "make_body_iterator"),
524 -1);
526 Auto_Destroyer <URL_Processing_Strategy> ps (vs->make_body_strategy (**is));
527 if (*ps == 0)
528 ACE_ERROR_RETURN ((LM_ERROR,
529 "%p\n",
530 "make_body_strategy"),
531 -1);
533 if (ps->execute () == -1)
534 ACE_ERROR_RETURN ((LM_ERROR,
535 "%p\n",
536 "body execute"),
537 -1);
538 return 0;