1 // Copyright (c) 2010 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 // PhishingUrlFeatureExtractor handles computing URL-based features for
6 // the client-side phishing detection model. These include tokens in the
7 // host and path, features pertaining to host length, and IP addresses.
9 #ifndef CHROME_RENDERER_SAFE_BROWSING_PHISHING_URL_FEATURE_EXTRACTOR_H_
10 #define CHROME_RENDERER_SAFE_BROWSING_PHISHING_URL_FEATURE_EXTRACTOR_H_
15 #include "base/basictypes.h"
19 namespace safe_browsing
{
22 class PhishingUrlFeatureExtractor
{
24 PhishingUrlFeatureExtractor();
25 ~PhishingUrlFeatureExtractor();
27 // Extracts features for |url| into the given feature map.
28 // Returns true on success.
29 bool ExtractFeatures(const GURL
& url
, FeatureMap
* features
);
32 friend class PhishingUrlFeatureExtractorTest
;
34 static const size_t kMinPathComponentLength
= 3;
36 // Given a string, finds all substrings of consecutive alphanumeric
37 // characters of length >= kMinPathComponentLength and inserts them into
39 static void SplitStringIntoLongAlphanumTokens(
40 const std::string
& full
,
41 std::vector
<std::string
>* tokens
);
43 DISALLOW_COPY_AND_ASSIGN(PhishingUrlFeatureExtractor
);
46 } // namespace safe_browsing
48 #endif // CHROME_RENDERER_SAFE_BROWSING_PHISHING_URL_FEATURE_EXTRACTOR_H_