Add abhijeet.k@samsung.com to AUTHORS list.
[chromium-blink-merge.git] / components / dom_distiller / core / page_features.cc
blob05405886a01fc33dbd2fbc63fb63f71e4f5b009e
1 // Copyright 2015 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "components/dom_distiller/core/page_features.h"
7 #include <string>
9 #include "base/json/json_reader.h"
10 #include "third_party/re2/re2/re2.h"
12 namespace dom_distiller {
/* This code needs to derive features in the same way and order in which they
 * are derived when training the model. Parts of that code are reproduced in the
 * comments below.
 */
18 namespace {
// Returns the last segment of |path|: the trailing run of non-'/' characters
// together with one optional trailing '/'. This mirrors the training
// pipeline's
//   return re.search('[^/]*\/?$', path).group(0)
// so "/a/b" yields "b", "/a/b/" yields "b/", and "/" yields "/".
std::string GetLastSegment(const std::string& path) {
  // A path of zero or one characters is its own last segment ("" or e.g. "/").
  if (path.size() < 2)
    return path;
  // Begin the backwards search one character before the end so that a
  // trailing '/' is kept as part of the segment instead of being treated as
  // the separator that precedes it.
  const size_t start = path.rfind('/', path.size() - 2);
  // With no earlier separator the regular expression matches the whole path.
  return start == std::string::npos ? path : path.substr(start + 1);
}
27 int CountMatches(const std::string& s, const std::string& p) {
28 // return len(re.findall(p, s))
29 re2::StringPiece sp(s);
30 re2::RE2 regexp(p);
31 int count = 0;
32 while (re2::RE2::FindAndConsume(&sp, regexp))
33 count++;
34 return count;
37 int GetWordCount(const std::string& s) {
38 return CountMatches(s, "\\w+");
// Returns true when needle |n| occurs anywhere inside haystack |h|. An empty
// needle is found in every haystack.
bool Contains(const std::string& n, const std::string& h) {
  const std::string::size_type position = h.find(n);
  if (position == std::string::npos)
    return false;
  return true;
}
// Returns true when |s| ends with the suffix |t|. An empty suffix matches
// every string.
bool EndsWith(const std::string& t, const std::string& s) {
  // A suffix longer than the string itself can never match.
  if (t.size() > s.size())
    return false;
  const std::string::size_type tail_start = s.size() - t.size();
  return s.compare(tail_start, t.size(), t) == 0;
}
51 int kDerivedFeaturesCount = 29;
53 std::vector<double> CalculateDerivedFeatures(bool isOGArticle,
54 const GURL& url,
55 double numElements,
56 double numAnchors,
57 double numForms,
58 const std::string& innerText,
59 const std::string& textContent,
60 const std::string& innerHTML) {
61 // In the training pipeline, the strings are explicitly encoded in utf-8 (as
62 // they are here).
63 const std::string& path = url.path();
64 int innerTextWords = GetWordCount(innerText);
65 int textContentWords = GetWordCount(textContent);
66 int innerHTMLWords = GetWordCount(innerHTML);
67 std::vector<double> features;
68 // 'opengraph', opengraph,
69 features.push_back(isOGArticle);
70 // 'forum', 'forum' in path,
71 features.push_back(Contains("forum", path));
72 // 'index', 'index' in path,
73 features.push_back(Contains("index", path));
74 // 'view', 'view' in path,
75 features.push_back(Contains("view", path));
76 // 'asp', '.asp' in path,
77 features.push_back(Contains(".asp", path));
78 // 'phpbb', 'phpbb' in path,
79 features.push_back(Contains("phpbb", path));
80 // 'php', path.endswith('.php'),
81 features.push_back(EndsWith(".php", path));
82 // 'pathlength', len(path),
83 features.push_back(path.size());
84 // 'domain', len(path) < 2,
85 features.push_back(path.size() < 2);
86 // 'pathcomponents', CountMatches(path, r'\/.'),
87 features.push_back(CountMatches(path, "\\/."));
88 // 'slugdetector', CountMatches(path, r'[^\w/]'),
89 features.push_back(CountMatches(path, "[^\\w/]"));
90 // 'pathnumbers', CountMatches(path, r'\d+'),
91 features.push_back(CountMatches(path, "\\d+"));
92 // 'lastSegmentLength', len(GetLastSegment(path)),
93 features.push_back(GetLastSegment(path).size());
94 // 'formcount', numForms,
95 features.push_back(numForms);
96 // 'anchorcount', numAnchors,
97 features.push_back(numAnchors);
98 // 'elementcount', numElements,
99 features.push_back(numElements);
100 // 'anchorratio', float(numAnchors) / max(1, numElements),
101 features.push_back(double(numAnchors) / std::max<double>(1, numElements));
102 // 'innertextlength', len(innerText),
103 features.push_back(innerText.size());
104 // 'textcontentlength', len(textContent),
105 features.push_back(textContent.size());
106 // 'innerhtmllength', len(innerHTML),
107 features.push_back(innerHTML.size());
108 // 'innertextlengthratio', float(len(innerText)) / max(1, len(innerHTML)),
109 features.push_back(double(innerText.size()) /
110 std::max<double>(1.0, innerHTML.size()));
111 // 'textcontentlengthratio', float(len(textContent)) / max(1, len(innerHTML)),
112 features.push_back(double(textContent.size()) /
113 std::max<double>(1.0, innerHTML.size()));
114 // 'innertexttextcontentlengthratio',
115 // float(len(innerText)) / max(1, len(textContent)),
116 features.push_back(double(innerText.size()) /
117 std::max<double>(1.0, textContent.size()));
118 // 'innertextwordcount', innerTextWords,
119 features.push_back(innerTextWords);
120 // 'textcontentwordcount', textContentWords,
121 features.push_back(textContentWords);
122 // 'innerhtmlwordcount', innerHTMLWords,
123 features.push_back(innerHTMLWords);
124 // 'innertextwordcountratio', float(innerTextWords) / max(1, innerHTMLWords),
125 features.push_back(double(innerTextWords) /
126 std::max<int>(1.0, innerHTMLWords));
127 // 'textcontentwordcountratio',
128 // float(textContentWords) / max(1, innerHTMLWords),
129 features.push_back(double(textContentWords) /
130 std::max<int>(1.0, innerHTMLWords));
131 // 'innertexttextcontentwordcountratio',
132 // float(innerTextWords) / max(1, textContentWords),
133 features.push_back(double(innerTextWords) /
134 std::max<int>(1.0, textContentWords));
135 return features;
138 std::vector<double> CalculateDerivedFeaturesFromJSON(
139 const base::Value* stringified_json) {
140 std::string stringified;
141 if (!stringified_json->GetAsString(&stringified)) {
142 return std::vector<double>();
145 scoped_ptr<base::Value> json(base::JSONReader::Read(stringified));
146 if (!json) {
147 return std::vector<double>();
150 const base::DictionaryValue* dict;
151 if (!json->GetAsDictionary(&dict)) {
152 return std::vector<double>();
155 bool isOGArticle = false;
156 std::string url, innerText, textContent, innerHTML;
157 double numElements = 0.0, numAnchors = 0.0, numForms = 0.0;
159 if (!(dict->GetBoolean("opengraph", &isOGArticle) &&
160 dict->GetString("url", &url) &&
161 dict->GetDouble("numElements", &numElements) &&
162 dict->GetDouble("numAnchors", &numAnchors) &&
163 dict->GetDouble("numForms", &numForms) &&
164 dict->GetString("innerText", &innerText) &&
165 dict->GetString("textContent", &textContent) &&
166 dict->GetString("innerHTML", &innerHTML))) {
167 return std::vector<double>();
170 GURL parsed_url(url);
171 if (!parsed_url.is_valid()) {
172 return std::vector<double>();
175 return CalculateDerivedFeatures(isOGArticle, parsed_url, numElements,
176 numAnchors, numForms, innerText, textContent,
177 innerHTML);