1 // Copyright 2015 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "components/dom_distiller/core/page_features.h"
9 #include "base/json/json_reader.h"
10 #include "third_party/re2/re2/re2.h"
12 namespace dom_distiller
{
/* This code needs to derive features in the same way and order in which they
 * are derived when training the model. Parts of that code are reproduced in
 * the comments below.
 */
// Returns the final segment of |path|, keeping a single trailing slash
// attached to it when present. Mirrors the Python used when training the
// model:
//   return re.search('[^/]*\/?$', path).group(0)
// Examples: "/a/b" -> "b", "/a/b/" -> "b/", "/" -> "/", "" -> "".
std::string GetLastSegment(const std::string& path) {
  // An empty path or a single character (normally "/") is its own last
  // segment. Returning early also keeps |path.size() - 2| below from wrapping.
  if (path.size() <= 1)
    return path;

  // Begin the backwards search one character before the end, so that a
  // trailing slash stays with the segment it terminates instead of being
  // mistaken for the separator in front of an empty segment.
  const size_t start = path.rfind('/', path.size() - 2);

  // Without an earlier slash the whole path is a single segment, which is what
  // the regular expression above would match.
  return start == std::string::npos ? path : path.substr(start + 1);
}
27 int CountMatches(const std::string
& s
, const std::string
& p
) {
28 // return len(re.findall(p, s))
29 re2::StringPiece
sp(s
);
32 while (re2::RE2::FindAndConsume(&sp
, regexp
))
37 int GetWordCount(const std::string
& s
) {
38 return CountMatches(s
, "\\w+");
// Returns true when |n| occurs anywhere inside |h|. Note the argument order:
// the string searched for comes first and the string searched in comes second,
// matching the `'x' in path` expressions quoted at the call sites.
bool Contains(const std::string& n, const std::string& h) {
  return h.find(n) != std::string::npos;
}
// Returns true when |s| ends with |t|. Note the argument order: the suffix
// comes first and the string being tested comes second.
bool EndsWith(const std::string& t, const std::string& s) {
  // The length check must come first: it keeps |s.size() - t.size()| from
  // wrapping around when the suffix is longer than the string.
  return s.size() >= t.size() &&
         s.compare(s.size() - t.size(), std::string::npos, t) == 0;
}
// Number of entries in the vector built by CalculateDerivedFeatures(); keep it
// in sync with the number of push_back() calls there.
// NOTE(review): not declared const -- presumably because a header exposes it
// as `extern int`; confirm before changing the qualifier.
int kDerivedFeaturesCount = 29;
53 std::vector
<double> CalculateDerivedFeatures(bool isOGArticle
,
58 const std::string
& innerText
,
59 const std::string
& textContent
,
60 const std::string
& innerHTML
) {
61 // In the training pipeline, the strings are explicitly encoded in utf-8 (as
63 const std::string
& path
= url
.path();
64 int innerTextWords
= GetWordCount(innerText
);
65 int textContentWords
= GetWordCount(textContent
);
66 int innerHTMLWords
= GetWordCount(innerHTML
);
67 std::vector
<double> features
;
68 // 'opengraph', opengraph,
69 features
.push_back(isOGArticle
);
70 // 'forum', 'forum' in path,
71 features
.push_back(Contains("forum", path
));
72 // 'index', 'index' in path,
73 features
.push_back(Contains("index", path
));
74 // 'view', 'view' in path,
75 features
.push_back(Contains("view", path
));
76 // 'asp', '.asp' in path,
77 features
.push_back(Contains(".asp", path
));
78 // 'phpbb', 'phpbb' in path,
79 features
.push_back(Contains("phpbb", path
));
80 // 'php', path.endswith('.php'),
81 features
.push_back(EndsWith(".php", path
));
82 // 'pathlength', len(path),
83 features
.push_back(path
.size());
84 // 'domain', len(path) < 2,
85 features
.push_back(path
.size() < 2);
86 // 'pathcomponents', CountMatches(path, r'\/.'),
87 features
.push_back(CountMatches(path
, "\\/."));
88 // 'slugdetector', CountMatches(path, r'[^\w/]'),
89 features
.push_back(CountMatches(path
, "[^\\w/]"));
90 // 'pathnumbers', CountMatches(path, r'\d+'),
91 features
.push_back(CountMatches(path
, "\\d+"));
92 // 'lastSegmentLength', len(GetLastSegment(path)),
93 features
.push_back(GetLastSegment(path
).size());
94 // 'formcount', numForms,
95 features
.push_back(numForms
);
96 // 'anchorcount', numAnchors,
97 features
.push_back(numAnchors
);
98 // 'elementcount', numElements,
99 features
.push_back(numElements
);
100 // 'anchorratio', float(numAnchors) / max(1, numElements),
101 features
.push_back(double(numAnchors
) / std::max
<double>(1, numElements
));
102 // 'innertextlength', len(innerText),
103 features
.push_back(innerText
.size());
104 // 'textcontentlength', len(textContent),
105 features
.push_back(textContent
.size());
106 // 'innerhtmllength', len(innerHTML),
107 features
.push_back(innerHTML
.size());
108 // 'innertextlengthratio', float(len(innerText)) / max(1, len(innerHTML)),
109 features
.push_back(double(innerText
.size()) /
110 std::max
<double>(1.0, innerHTML
.size()));
111 // 'textcontentlengthratio', float(len(textContent)) / max(1, len(innerHTML)),
112 features
.push_back(double(textContent
.size()) /
113 std::max
<double>(1.0, innerHTML
.size()));
114 // 'innertexttextcontentlengthratio',
115 // float(len(innerText)) / max(1, len(textContent)),
116 features
.push_back(double(innerText
.size()) /
117 std::max
<double>(1.0, textContent
.size()));
118 // 'innertextwordcount', innerTextWords,
119 features
.push_back(innerTextWords
);
120 // 'textcontentwordcount', textContentWords,
121 features
.push_back(textContentWords
);
122 // 'innerhtmlwordcount', innerHTMLWords,
123 features
.push_back(innerHTMLWords
);
124 // 'innertextwordcountratio', float(innerTextWords) / max(1, innerHTMLWords),
125 features
.push_back(double(innerTextWords
) /
126 std::max
<int>(1.0, innerHTMLWords
));
127 // 'textcontentwordcountratio',
128 // float(textContentWords) / max(1, innerHTMLWords),
129 features
.push_back(double(textContentWords
) /
130 std::max
<int>(1.0, innerHTMLWords
));
131 // 'innertexttextcontentwordcountratio',
132 // float(innerTextWords) / max(1, textContentWords),
133 features
.push_back(double(innerTextWords
) /
134 std::max
<int>(1.0, textContentWords
));
138 std::vector
<double> CalculateDerivedFeaturesFromJSON(
139 const base::Value
* stringified_json
) {
140 std::string stringified
;
141 if (!stringified_json
->GetAsString(&stringified
)) {
142 return std::vector
<double>();
145 scoped_ptr
<base::Value
> json(base::JSONReader::Read(stringified
));
147 return std::vector
<double>();
150 const base::DictionaryValue
* dict
;
151 if (!json
->GetAsDictionary(&dict
)) {
152 return std::vector
<double>();
155 bool isOGArticle
= false;
156 std::string url
, innerText
, textContent
, innerHTML
;
157 double numElements
= 0.0, numAnchors
= 0.0, numForms
= 0.0;
159 if (!(dict
->GetBoolean("opengraph", &isOGArticle
) &&
160 dict
->GetString("url", &url
) &&
161 dict
->GetDouble("numElements", &numElements
) &&
162 dict
->GetDouble("numAnchors", &numAnchors
) &&
163 dict
->GetDouble("numForms", &numForms
) &&
164 dict
->GetString("innerText", &innerText
) &&
165 dict
->GetString("textContent", &textContent
) &&
166 dict
->GetString("innerHTML", &innerHTML
))) {
167 return std::vector
<double>();
170 GURL
parsed_url(url
);
171 if (!parsed_url
.is_valid()) {
172 return std::vector
<double>();
175 return CalculateDerivedFeatures(isOGArticle
, parsed_url
, numElements
,
176 numAnchors
, numForms
, innerText
, textContent
,