1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 // This class loads a client-side model and lets you compute a phishing score
6 // for a set of previously extracted features. The phishing score corresponds
7 // to the probability that the features are indicative of a phishing site.
9 // For more details on how the score is actually computed for a given model
10 // and a given set of features, see the comments in client_model.proto.
12 // See features.h for a list of features that are currently used.
14 #ifndef CHROME_RENDERER_SAFE_BROWSING_SCORER_H_
15 #define CHROME_RENDERER_SAFE_BROWSING_SCORER_H_
19 #include "base/basictypes.h"
20 #include "base/containers/hash_tables.h"
21 #include "base/strings/string_piece.h"
22 #include "chrome/common/safe_browsing/client_model.pb.h"
24 namespace safe_browsing
{
27 // Scorer methods are virtual to simplify mocking of this class.
32 // Factory method which creates a new Scorer object by parsing the given
33 // model. If parsing fails this method returns NULL.
// |model_str| holds the model to parse and is taken by const reference, so
// it is not copied or modified by the call itself.
// NOTE(review): presumably |model_str| is a serialized ClientSideModel
// protobuf and the caller takes ownership of the returned Scorer -- confirm
// both against the implementation.
34 static Scorer
* Create(const base::StringPiece
& model_str
);
36 // This method computes the probability that the given features are indicative
37 // of phishing. It returns a score value that falls in the range [0.0,1.0]
38 // (range is inclusive on both ends).
// |features| is the map of previously extracted features to score. The
// method is declared const and virtual; being virtual lets a mock override
// it.
39 virtual double ComputeScore(const FeatureMap
& features
) const;
41 // Returns the version number of the loaded client model.
// NOTE(review): presumably read from the parsed model held in |model_| --
// confirm against the implementation.
42 int model_version() const;
44 // -- Accessors used by the page feature extractor ---------------------------
46 // Returns a set of hashed page terms that appear in the model in binary
// format. The hashes are represented as std::string values.
// NOTE(review): the returned reference presumably refers to |page_terms_|
// and therefore stays valid only as long as this Scorer does -- confirm.
48 const base::hash_set
<std::string
>& page_terms() const;
50 // Returns a set of hashed page words that appear in the model in binary
// format. The hashes are represented as uint32 values.
// NOTE(review): the returned reference presumably refers to |page_words_|
// and therefore stays valid only as long as this Scorer does -- confirm.
52 const base::hash_set
<uint32
>& page_words() const;
54 // Returns the maximum number of words per term for the loaded model.
// NOTE(review): presumably the upper bound on how many consecutive page
// words the feature extractor combines into one term -- confirm.
55 size_t max_words_per_term() const;
57 // Returns the murmurhash3 seed for the loaded model.
// NOTE(review): presumably the seed the feature extractor must use so that
// its word hashes are comparable with the values in page_words() -- confirm.
58 uint32
murmurhash3_seed() const;
60 // Returns the maximum number of unique shingle hashes per page.
// NOTE(review): presumably a limit taken from the loaded model -- confirm.
61 size_t max_shingles_per_page() const;
63 // Returns the number of words in a shingle.
// NOTE(review): presumably taken from the loaded model -- confirm.
64 size_t shingle_size() const;
67 // Most clients should use the factory method. This constructor is public
68 // to allow for mock implementations.
// Grants PhishingScorerTest access to the non-public members of this class.
// NOTE(review): presumably the unit-test fixture for Scorer -- confirm.
72 friend class PhishingScorerTest
;
74 // Computes the score for a given rule and feature map. The score is computed
75 // by multiplying the rule weight with the product of feature weights for the
76 // given rule. The feature weights are stored in the feature map. If a
77 // particular feature does not exist in the feature map we set its weight to
// zero.
// NOTE(review): the end of the sentence above was missing in this copy of
// the file; "zero" has been restored and should be confirmed against the
// implementation.
// |rule| is the model rule to evaluate and |features| supplies the feature
// weights; both are taken by const reference.
79 double ComputeRuleScore(const ClientSideModel::Rule
& rule
,
80 const FeatureMap
& features
) const;
// The client-side model this Scorer was loaded with.
// NOTE(review): presumably populated by Create() when parsing succeeds --
// confirm against the implementation.
82 ClientSideModel model_
;
// Set of page-term hashes, stored as std::string values.
// NOTE(review): presumably the backing store for page_terms() -- confirm.
83 base::hash_set
<std::string
> page_terms_
;
// Set of page-word hashes, stored as uint32 values.
// NOTE(review): presumably the backing store for page_words() -- confirm.
84 base::hash_set
<uint32
> page_words_
;
// Makes Scorer non-copyable and non-assignable, as the macro name states.
// NOTE(review): the macro is presumably provided by base/basictypes.h --
// confirm.
86 DISALLOW_COPY_AND_ASSIGN(Scorer
);
88 } // namespace safe_browsing
90 #endif // CHROME_RENDERER_SAFE_BROWSING_SCORER_H_