// Copyright (c) 2011 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
5 #include "chrome/renderer/safe_browsing/scorer.h"
9 #include "base/logging.h"
10 #include "base/memory/scoped_ptr.h"
11 #include "base/metrics/histogram.h"
12 #include "base/strings/string_piece.h"
13 #include "chrome/common/safe_browsing/client_model.pb.h"
14 #include "chrome/renderer/safe_browsing/features.h"
// Enum used to keep stats about the status of the Scorer creation.  Values
// are passed to RecordScorerCreationStatus(), which reports them to a UMA
// enumeration histogram bounded by SCORER_STATUS_MAX.
//
// NOTE(review): SCORER_SUCCESS is referenced by Scorer::Create() but its
// declaration (and the closing "};") was absent from this listing.  It is
// restored as the first enumerator, which is where the gap in the listing
// falls -- confirm against the canonical file.
enum ScorerCreationStatus {
  SCORER_SUCCESS,
  SCORER_FAIL_MODEL_OPEN_FAIL,       // Not used anymore
  SCORER_FAIL_MODEL_FILE_EMPTY,      // Not used anymore
  SCORER_FAIL_MODEL_FILE_TOO_LARGE,  // Not used anymore
  SCORER_FAIL_MODEL_PARSE_ERROR,
  SCORER_FAIL_MODEL_MISSING_FIELDS,
  SCORER_STATUS_MAX  // Always add new values before this one.
};
28 void RecordScorerCreationStatus(ScorerCreationStatus status
) {
29 UMA_HISTOGRAM_ENUMERATION("SBClientPhishing.ScorerCreationStatus",
35 namespace safe_browsing
{
// Helper function which converts log odds to a probability:
// for log odds x the result is exp(x) / (exp(x) + 1).
static double LogOdds2Prob(double log_odds) {
  // 709 = floor(1023*ln(2)).  2**1023 is the largest finite double.
  // Small log odds aren't a problem, as the odds will be 0.  It's only
  // when we get +infinity for the odds, that odds/(odds+1) would be NaN.
  if (log_odds >= 709) {
    // NOTE(review): the body of this guard was missing from the listing.
    // 1.0 is the limit of odds/(odds+1) as the odds grow without bound --
    // confirm against the canonical file.
    return 1.0;
  }
  double odds = exp(log_odds);
  return odds / (odds + 1.0);
}
// Factory for Scorer.  Parses |model_str| (its data()/size() bytes) into the
// new Scorer's |model_| and, once the model is accepted, fills |page_terms_|
// and |page_words_| from it.  The result is handed back via
// scoped_ptr::release(), i.e. as a raw pointer no longer owned by the local
// scoped_ptr.
//
// NOTE(review): this listing is extraction-damaged.  The tail of the first
// DLOG message, the statements that end both failure branches, and several
// closing braces are not visible here.  Presumably both failure branches
// return a null pointer -- confirm against the canonical file before building.
54 Scorer
* Scorer::Create(const base::StringPiece
& model_str
) {
// The Scorer is held in a scoped_ptr until it is released at the end.
55 scoped_ptr
<Scorer
> scorer(new Scorer());
56 ClientSideModel
& model
= scorer
->model_
;
// Failure case 1: the bytes do not parse.  The outcome is logged and recorded
// as SCORER_FAIL_MODEL_PARSE_ERROR.
57 if (!model
.ParseFromArray(model_str
.data(), model_str
.size())) {
58 DLOG(ERROR
) << "Unable to parse phishing model. This Scorer object is "
60 RecordScorerCreationStatus(SCORER_FAIL_MODEL_PARSE_ERROR
);
// Failure case 2: parsing succeeded but IsInitialized() is false.  Recorded
// as SCORER_FAIL_MODEL_MISSING_FIELDS.
62 } else if (!model
.IsInitialized()) {
63 DLOG(ERROR
) << "Unable to parse phishing model. The model is missing "
64 << "some required fields. Maybe the .proto file changed?";
65 RecordScorerCreationStatus(SCORER_FAIL_MODEL_MISSING_FIELDS
);
// Success path: record the status, then build the lookup sets.
68 RecordScorerCreationStatus(SCORER_SUCCESS
);
// Each page_term(i) is passed to model.hashes() and the result is stored in
// |page_terms_|.
69 for (int i
= 0; i
< model
.page_term_size(); ++i
) {
70 scorer
->page_terms_
.insert(model
.hashes(model
.page_term(i
)));
// Each page_word(i) value is stored directly in |page_words_|.
72 for (int i
= 0; i
< model
.page_word_size(); ++i
) {
73 scorer
->page_words_
.insert(model
.page_word(i
));
// Give up the scoped_ptr's ownership and return the raw pointer.
75 return scorer
.release();
78 double Scorer::ComputeScore(const FeatureMap
& features
) const {
80 for (int i
= 0; i
< model_
.rule_size(); ++i
) {
81 logodds
+= ComputeRuleScore(model_
.rule(i
), features
);
83 return LogOdds2Prob(logodds
);
86 int Scorer::model_version() const {
87 return model_
.version();
90 const base::hash_set
<std::string
>& Scorer::page_terms() const {
94 const base::hash_set
<uint32
>& Scorer::page_words() const {
98 size_t Scorer::max_words_per_term() const {
99 return model_
.max_words_per_term();
102 uint32
Scorer::murmurhash3_seed() const {
103 return model_
.murmur_hash_seed();
106 size_t Scorer::max_shingles_per_page() const {
107 return model_
.max_shingles_per_page();
110 size_t Scorer::shingle_size() const {
111 return model_
.shingle_size();
114 double Scorer::ComputeRuleScore(const ClientSideModel::Rule
& rule
,
115 const FeatureMap
& features
) const {
116 const base::hash_map
<std::string
, double>& feature_map
= features
.features();
117 double rule_score
= 1.0;
118 for (int i
= 0; i
< rule
.feature_size(); ++i
) {
119 base::hash_map
<std::string
, double>::const_iterator it
= feature_map
.find(
120 model_
.hashes(rule
.feature(i
)));
121 if (it
== feature_map
.end() || it
->second
== 0.0) {
122 // If the feature of the rule does not exist in the given feature map the
123 // feature weight is considered to be zero. If the feature weight is zero
124 // we leave early since we know that the rule score will be zero.
127 rule_score
*= it
->second
;
129 return rule_score
* rule
.weight();
131 } // namespace safe_browsing