Roll src/third_party/WebKit 3aea697:d9c6159 (svn 201973:201974)
[chromium-blink-merge.git] / components / dom_distiller / core / page_features.cc
blobf931bbe5cb23cdc75513005d00cac20c84b92b93
1 // Copyright 2015 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "components/dom_distiller/core/page_features.h"
7 #include <string>
9 #include "base/json/json_reader.h"
10 #include "third_party/re2/re2/re2.h"
12 namespace dom_distiller {
/* This code needs to derive features in the same way and in the same order as
 * they are derived when training the model. Parts of that training code are
 * reproduced in the comments below.
 */
18 namespace {
// Returns the last segment of |path|, keeping a single trailing '/' when one
// is present. This mirrors the training pipeline's
//   return re.search('[^/]*\/?$', path).group(0)
// Examples: "/a/b" -> "b", "/a/b/" -> "b/", "/" -> "/", "" -> "".
std::string GetLastSegment(const std::string& path) {
  // Strings shorter than two characters ("", "/", "x") are matched whole by
  // the regex above. Handling them here also keeps |path.size() - 2| below
  // from wrapping around.
  if (path.size() < 2)
    return path;
  // A trailing '/' belongs to the last segment, so the separator we want is
  // the last '/' strictly before the final character.
  const size_t separator = path.rfind('/', path.size() - 2);
  // Without an earlier separator the regex matches the entire string.
  return separator == std::string::npos ? path : path.substr(separator + 1);
}
28 int CountMatches(const std::string& s, const std::string& p) {
29 // return len(re.findall(p, s))
30 re2::StringPiece sp(s);
31 re2::RE2 regexp(p);
32 int count = 0;
33 while (re2::RE2::FindAndConsume(&sp, regexp))
34 count++;
35 return count;
38 int GetWordCount(const std::string& s) {
39 return CountMatches(s, "\\w+");
// Returns true when |n| (the needle) occurs somewhere inside |h| (the
// haystack). Note the argument order: needle first, haystack second.
bool Contains(const std::string& n, const std::string& h) {
  if (h.find(n) == std::string::npos)
    return false;
  return true;
}
// Returns true when |s| ends with the suffix |t|. Note the argument order:
// suffix first, string under test second.
bool EndsWith(const std::string& t, const std::string& s) {
  // A suffix longer than the string can never match; bailing out here also
  // keeps the offset computation below from underflowing.
  if (t.size() > s.size())
    return false;
  const std::string::size_type offset = s.size() - t.size();
  return s.compare(offset, std::string::npos, t) == 0;
}
51 } // namespace
53 int kDerivedFeaturesCount = 29;
55 std::vector<double> CalculateDerivedFeatures(bool isOGArticle,
56 const GURL& url,
57 double numElements,
58 double numAnchors,
59 double numForms,
60 const std::string& innerText,
61 const std::string& textContent,
62 const std::string& innerHTML) {
63 // In the training pipeline, the strings are explicitly encoded in utf-8 (as
64 // they are here).
65 const std::string& path = url.path();
66 int innerTextWords = GetWordCount(innerText);
67 int textContentWords = GetWordCount(textContent);
68 int innerHTMLWords = GetWordCount(innerHTML);
69 std::vector<double> features;
70 // 'opengraph', opengraph,
71 features.push_back(isOGArticle);
72 // 'forum', 'forum' in path,
73 features.push_back(Contains("forum", path));
74 // 'index', 'index' in path,
75 features.push_back(Contains("index", path));
76 // 'view', 'view' in path,
77 features.push_back(Contains("view", path));
78 // 'asp', '.asp' in path,
79 features.push_back(Contains(".asp", path));
80 // 'phpbb', 'phpbb' in path,
81 features.push_back(Contains("phpbb", path));
82 // 'php', path.endswith('.php'),
83 features.push_back(EndsWith(".php", path));
84 // 'pathlength', len(path),
85 features.push_back(path.size());
86 // 'domain', len(path) < 2,
87 features.push_back(path.size() < 2);
88 // 'pathcomponents', CountMatches(path, r'\/.'),
89 features.push_back(CountMatches(path, "\\/."));
90 // 'slugdetector', CountMatches(path, r'[^\w/]'),
91 features.push_back(CountMatches(path, "[^\\w/]"));
92 // 'pathnumbers', CountMatches(path, r'\d+'),
93 features.push_back(CountMatches(path, "\\d+"));
94 // 'lastSegmentLength', len(GetLastSegment(path)),
95 features.push_back(GetLastSegment(path).size());
96 // 'formcount', numForms,
97 features.push_back(numForms);
98 // 'anchorcount', numAnchors,
99 features.push_back(numAnchors);
100 // 'elementcount', numElements,
101 features.push_back(numElements);
102 // 'anchorratio', float(numAnchors) / max(1, numElements),
103 features.push_back(double(numAnchors) / std::max<double>(1, numElements));
104 // 'innertextlength', len(innerText),
105 features.push_back(innerText.size());
106 // 'textcontentlength', len(textContent),
107 features.push_back(textContent.size());
108 // 'innerhtmllength', len(innerHTML),
109 features.push_back(innerHTML.size());
110 // 'innertextlengthratio', float(len(innerText)) / max(1, len(innerHTML)),
111 features.push_back(double(innerText.size()) /
112 std::max<double>(1.0, innerHTML.size()));
113 // 'textcontentlengthratio', float(len(textContent)) / max(1, len(innerHTML)),
114 features.push_back(double(textContent.size()) /
115 std::max<double>(1.0, innerHTML.size()));
116 // 'innertexttextcontentlengthratio',
117 // float(len(innerText)) / max(1, len(textContent)),
118 features.push_back(double(innerText.size()) /
119 std::max<double>(1.0, textContent.size()));
120 // 'innertextwordcount', innerTextWords,
121 features.push_back(innerTextWords);
122 // 'textcontentwordcount', textContentWords,
123 features.push_back(textContentWords);
124 // 'innerhtmlwordcount', innerHTMLWords,
125 features.push_back(innerHTMLWords);
126 // 'innertextwordcountratio', float(innerTextWords) / max(1, innerHTMLWords),
127 features.push_back(double(innerTextWords) /
128 std::max<int>(1.0, innerHTMLWords));
129 // 'textcontentwordcountratio',
130 // float(textContentWords) / max(1, innerHTMLWords),
131 features.push_back(double(textContentWords) /
132 std::max<int>(1.0, innerHTMLWords));
133 // 'innertexttextcontentwordcountratio',
134 // float(innerTextWords) / max(1, textContentWords),
135 features.push_back(double(innerTextWords) /
136 std::max<int>(1.0, textContentWords));
137 return features;
140 std::vector<double> CalculateDerivedFeaturesFromJSON(
141 const base::Value* stringified_json) {
142 std::string stringified;
143 if (!stringified_json->GetAsString(&stringified)) {
144 return std::vector<double>();
147 scoped_ptr<base::Value> json = base::JSONReader::Read(stringified);
148 if (!json) {
149 return std::vector<double>();
152 const base::DictionaryValue* dict;
153 if (!json->GetAsDictionary(&dict)) {
154 return std::vector<double>();
157 bool isOGArticle = false;
158 std::string url, innerText, textContent, innerHTML;
159 double numElements = 0.0, numAnchors = 0.0, numForms = 0.0;
161 if (!(dict->GetBoolean("opengraph", &isOGArticle) &&
162 dict->GetString("url", &url) &&
163 dict->GetDouble("numElements", &numElements) &&
164 dict->GetDouble("numAnchors", &numAnchors) &&
165 dict->GetDouble("numForms", &numForms) &&
166 dict->GetString("innerText", &innerText) &&
167 dict->GetString("textContent", &textContent) &&
168 dict->GetString("innerHTML", &innerHTML))) {
169 return std::vector<double>();
172 GURL parsed_url(url);
173 if (!parsed_url.is_valid()) {
174 return std::vector<double>();
177 return CalculateDerivedFeatures(isOGArticle, parsed_url, numElements,
178 numAnchors, numForms, innerText, textContent,
179 innerHTML);
182 } // namespace dom_distiller