Explicitly add python-numpy dependency to install-build-deps.
[chromium-blink-merge.git] / chrome / renderer / safe_browsing / scorer.cc
bloba8a23afd868d67cf84cfe2c4746d086495604299
1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "chrome/renderer/safe_browsing/scorer.h"
7 #include <math.h>
9 #include "base/logging.h"
10 #include "base/memory/scoped_ptr.h"
11 #include "base/metrics/histogram.h"
12 #include "base/strings/string_piece.h"
13 #include "chrome/common/safe_browsing/client_model.pb.h"
14 #include "chrome/renderer/safe_browsing/features.h"
16 namespace {
// Outcome of building a Scorer, reported through a UMA enumeration histogram.
// The numeric values are what gets recorded, so existing entries keep their
// positions even when they are no longer emitted.
enum ScorerCreationStatus {
  SCORER_SUCCESS = 0,
  SCORER_FAIL_MODEL_OPEN_FAIL = 1,       // Not used anymore
  SCORER_FAIL_MODEL_FILE_EMPTY = 2,      // Not used anymore
  SCORER_FAIL_MODEL_FILE_TOO_LARGE = 3,  // Not used anymore
  SCORER_FAIL_MODEL_PARSE_ERROR = 4,
  SCORER_FAIL_MODEL_MISSING_FIELDS = 5,
  SCORER_STATUS_MAX = 6  // Always add new values before this one.
};
28 void RecordScorerCreationStatus(ScorerCreationStatus status) {
29 UMA_HISTOGRAM_ENUMERATION("SBClientPhishing.ScorerCreationStatus",
30 status,
31 SCORER_STATUS_MAX);
33 } // namespace
35 namespace safe_browsing {
// Maps a log-odds value to the matching probability in the range [0.0, 1.0],
// i.e. odds / (odds + 1) where odds = exp(log_odds).
static double LogOdds2Prob(double log_odds) {
  // 709 = floor(1023 * ln(2)), so exp() of anything below this bound is still
  // a finite double.  Very negative inputs need no special case: the odds
  // simply underflow to 0.  Only an odds value of +infinity is a problem,
  // because infinity / (infinity + 1) evaluates to NaN, so large inputs are
  // clamped to a probability of 1.
  const double kLargestSafeLogOdds = 709;
  const bool saturated = (log_odds >= kLargestSafeLogOdds);
  if (saturated)
    return 1.0;

  const double odds = exp(log_odds);
  return odds / (odds + 1.0);
}
50 Scorer::Scorer() {}
51 Scorer::~Scorer() {}
53 /* static */
54 Scorer* Scorer::Create(const base::StringPiece& model_str) {
55 scoped_ptr<Scorer> scorer(new Scorer());
56 ClientSideModel& model = scorer->model_;
57 if (!model.ParseFromArray(model_str.data(), model_str.size())) {
58 DLOG(ERROR) << "Unable to parse phishing model. This Scorer object is "
59 << "invalid.";
60 RecordScorerCreationStatus(SCORER_FAIL_MODEL_PARSE_ERROR);
61 return NULL;
62 } else if (!model.IsInitialized()) {
63 DLOG(ERROR) << "Unable to parse phishing model. The model is missing "
64 << "some required fields. Maybe the .proto file changed?";
65 RecordScorerCreationStatus(SCORER_FAIL_MODEL_MISSING_FIELDS);
66 return NULL;
68 RecordScorerCreationStatus(SCORER_SUCCESS);
69 for (int i = 0; i < model.page_term_size(); ++i) {
70 scorer->page_terms_.insert(model.hashes(model.page_term(i)));
72 for (int i = 0; i < model.page_word_size(); ++i) {
73 scorer->page_words_.insert(model.page_word(i));
75 return scorer.release();
78 double Scorer::ComputeScore(const FeatureMap& features) const {
79 double logodds = 0.0;
80 for (int i = 0; i < model_.rule_size(); ++i) {
81 logodds += ComputeRuleScore(model_.rule(i), features);
83 return LogOdds2Prob(logodds);
86 int Scorer::model_version() const {
87 return model_.version();
90 const base::hash_set<std::string>& Scorer::page_terms() const {
91 return page_terms_;
94 const base::hash_set<uint32>& Scorer::page_words() const {
95 return page_words_;
98 size_t Scorer::max_words_per_term() const {
99 return model_.max_words_per_term();
102 uint32 Scorer::murmurhash3_seed() const {
103 return model_.murmur_hash_seed();
106 size_t Scorer::max_shingles_per_page() const {
107 return model_.max_shingles_per_page();
110 size_t Scorer::shingle_size() const {
111 return model_.shingle_size();
114 double Scorer::ComputeRuleScore(const ClientSideModel::Rule& rule,
115 const FeatureMap& features) const {
116 const base::hash_map<std::string, double>& feature_map = features.features();
117 double rule_score = 1.0;
118 for (int i = 0; i < rule.feature_size(); ++i) {
119 base::hash_map<std::string, double>::const_iterator it = feature_map.find(
120 model_.hashes(rule.feature(i)));
121 if (it == feature_map.end() || it->second == 0.0) {
122 // If the feature of the rule does not exist in the given feature map the
123 // feature weight is considered to be zero. If the feature weight is zero
124 // we leave early since we know that the rule score will be zero.
125 return 0.0;
127 rule_score *= it->second;
129 return rule_score * rule.weight();
131 } // namespace safe_browsing