1 // Copyright 2015 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "components/dom_distiller/core/page_features.h"
9 #include "base/json/json_reader.h"
10 #include "third_party/re2/re2/re2.h"
12 namespace dom_distiller
{
13 /* This code needs to derive features in the same way and order in which they
14 * are derived when training the model. Parts of that code are reproduced in the
// Returns the portion of |path| that follows the '/' located by searching
// backwards from the last character of |path|, or an empty string when no
// '/' is found.
// NOTE(review): when |path| ends with '/', the rfind below appears to match
// that trailing slash itself, which yields an empty result, whereas the Python
// regex quoted in the body would keep a segment such as "b/" -- confirm which
// behavior is intended.
20 std::string
GetLastSegment(const std::string
& path
) {
21 // return re.search('[^/]*\/?$', path).group(0)
24 size_t start
= path
.rfind("/", path
.size() - 1);
25 return start
== std::string::npos
? "" : path
.substr(start
+ 1);
// Counts regular-expression matches in |s|, mirroring the Python expression
// quoted in the body. |s| is wrapped in a re2::StringPiece and
// re2::RE2::FindAndConsume is called on it in a loop until it fails.
// NOTE(review): the declaration of |regexp|, the loop body and the return
// statement are not shown here; presumably |regexp| is built from |p| and the
// loop increments a counter that is returned -- confirm.
28 int CountMatches(const std::string
& s
, const std::string
& p
) {
29 // return len(re.findall(p, s))
30 re2::StringPiece
sp(s
);
33 while (re2::RE2::FindAndConsume(&sp
, regexp
))
// Returns the number of words in |s|, where a word is a match of the pattern
// \w+ as counted by CountMatches.
38 int GetWordCount(const std::string
& s
) {
39 return CountMatches(s
, "\\w+");
// Returns true when |n| occurs as a substring of |h| (std::string::find does
// not return npos). Note the argument order: the string searched for comes
// first, the string searched in comes second.
42 bool Contains(const std::string
& n
, const std::string
& h
) {
43 return h
.find(n
) != std::string::npos
;
// Returns true when |s| ends with |t|. The size check comes first so that the
// offset s.size() - t.size() passed to compare() cannot wrap around when |t|
// is longer than |s|.
46 bool EndsWith(const std::string
& t
, const std::string
& s
) {
47 return s
.size() >= t
.size() &&
48 s
.compare(s
.size() - t
.size(), std::string::npos
, t
) == 0;
// Number of derived features. This matches the 29 push_back calls made on the
// feature vector in CalculateDerivedFeatures; keep the two in sync.
53 int kDerivedFeaturesCount
= 29;
// Builds the vector of derived page features. Values are appended to
// |features| in the order given by the per-feature comments in the body (each
// comment quotes the feature name and the Python expression it mirrors);
// 29 values are appended in total. Path-based features are computed from
// url.path(), length features use std::string::size(), and word counts come
// from GetWordCount. Boolean and integer values are stored as doubles because
// the vector's element type is double.
// NOTE(review): the declarations of |url|, |numElements|, |numAnchors| and
// |numForms| and the final return statement are not shown here; presumably
// they are the remaining parameters and the function returns |features| --
// confirm.
55 std::vector
<double> CalculateDerivedFeatures(bool isOGArticle
,
60 const std::string
& innerText
,
61 const std::string
& textContent
,
62 const std::string
& innerHTML
) {
63 // In the training pipeline, the strings are explicitly encoded in utf-8 (as
65 const std::string
& path
= url
.path();
66 int innerTextWords
= GetWordCount(innerText
);
67 int textContentWords
= GetWordCount(textContent
);
68 int innerHTMLWords
= GetWordCount(innerHTML
);
69 std::vector
<double> features
;
70 // 'opengraph', opengraph,
71 features
.push_back(isOGArticle
);
72 // 'forum', 'forum' in path,
73 features
.push_back(Contains("forum", path
));
74 // 'index', 'index' in path,
75 features
.push_back(Contains("index", path
));
76 // 'view', 'view' in path,
77 features
.push_back(Contains("view", path
));
78 // 'asp', '.asp' in path,
79 features
.push_back(Contains(".asp", path
));
80 // 'phpbb', 'phpbb' in path,
81 features
.push_back(Contains("phpbb", path
));
82 // 'php', path.endswith('.php'),
83 features
.push_back(EndsWith(".php", path
));
84 // 'pathlength', len(path),
85 features
.push_back(path
.size());
86 // 'domain', len(path) < 2,
87 features
.push_back(path
.size() < 2);
88 // 'pathcomponents', CountMatches(path, r'\/.'),
89 features
.push_back(CountMatches(path
, "\\/."));
90 // 'slugdetector', CountMatches(path, r'[^\w/]'),
91 features
.push_back(CountMatches(path
, "[^\\w/]"));
92 // 'pathnumbers', CountMatches(path, r'\d+'),
93 features
.push_back(CountMatches(path
, "\\d+"));
94 // 'lastSegmentLength', len(GetLastSegment(path)),
95 features
.push_back(GetLastSegment(path
).size());
96 // 'formcount', numForms,
97 features
.push_back(numForms
);
98 // 'anchorcount', numAnchors,
99 features
.push_back(numAnchors
);
100 // 'elementcount', numElements,
101 features
.push_back(numElements
);
102 // 'anchorratio', float(numAnchors) / max(1, numElements),
103 features
.push_back(double(numAnchors
) / std::max
<double>(1, numElements
));
104 // 'innertextlength', len(innerText),
105 features
.push_back(innerText
.size());
106 // 'textcontentlength', len(textContent),
107 features
.push_back(textContent
.size());
108 // 'innerhtmllength', len(innerHTML),
109 features
.push_back(innerHTML
.size());
110 // 'innertextlengthratio', float(len(innerText)) / max(1, len(innerHTML)),
111 features
.push_back(double(innerText
.size()) /
112 std::max
<double>(1.0, innerHTML
.size()));
113 // 'textcontentlengthratio', float(len(textContent)) / max(1, len(innerHTML)),
114 features
.push_back(double(textContent
.size()) /
115 std::max
<double>(1.0, innerHTML
.size()));
116 // 'innertexttextcontentlengthratio',
117 // float(len(innerText)) / max(1, len(textContent)),
118 features
.push_back(double(innerText
.size()) /
119 std::max
<double>(1.0, textContent
.size()));
120 // 'innertextwordcount', innerTextWords,
121 features
.push_back(innerTextWords
);
122 // 'textcontentwordcount', textContentWords,
123 features
.push_back(textContentWords
);
124 // 'innerhtmlwordcount', innerHTMLWords,
125 features
.push_back(innerHTMLWords
);
126 // 'innertextwordcountratio', float(innerTextWords) / max(1, innerHTMLWords),
// In the three word-count ratios below, std::max<int> converts the literal
// 1.0 to the int 1; the division is still carried out in double because the
// numerator is explicitly converted to double.
127 features
.push_back(double(innerTextWords
) /
128 std::max
<int>(1.0, innerHTMLWords
));
129 // 'textcontentwordcountratio',
130 // float(textContentWords) / max(1, innerHTMLWords),
131 features
.push_back(double(textContentWords
) /
132 std::max
<int>(1.0, innerHTMLWords
));
133 // 'innertexttextcontentwordcountratio',
134 // float(innerTextWords) / max(1, textContentWords),
135 features
.push_back(double(innerTextWords
) /
136 std::max
<int>(1.0, textContentWords
));
// Extracts the feature inputs from |stringified_json| and forwards them to
// CalculateDerivedFeatures. |stringified_json| must hold a string, which is
// parsed with base::JSONReader::Read and must yield a dictionary.
// Returns an empty vector when:
//   - |stringified_json| cannot be read as a string,
//   - the parsed value is not a dictionary,
//   - any of the keys "opengraph", "url", "numElements", "numAnchors",
//     "numForms", "innerText", "textContent" or "innerHTML" cannot be read
//     with the requested type, or
//   - the "url" value does not parse to a valid GURL.
// NOTE(review): the condition guarding the second early return (right after
// JSONReader::Read) and the tail of the final call are not shown here;
// presumably the guard checks for a null parse result -- confirm.
140 std::vector
<double> CalculateDerivedFeaturesFromJSON(
141 const base::Value
* stringified_json
) {
142 std::string stringified
;
143 if (!stringified_json
->GetAsString(&stringified
)) {
144 return std::vector
<double>();
147 scoped_ptr
<base::Value
> json
= base::JSONReader::Read(stringified
);
149 return std::vector
<double>();
152 const base::DictionaryValue
* dict
;
153 if (!json
->GetAsDictionary(&dict
)) {
154 return std::vector
<double>();
157 bool isOGArticle
= false;
158 std::string url
, innerText
, textContent
, innerHTML
;
159 double numElements
= 0.0, numAnchors
= 0.0, numForms
= 0.0;
// All eight keys must be present with the expected type; the && chain stops at
// the first lookup that fails.
161 if (!(dict
->GetBoolean("opengraph", &isOGArticle
) &&
162 dict
->GetString("url", &url
) &&
163 dict
->GetDouble("numElements", &numElements
) &&
164 dict
->GetDouble("numAnchors", &numAnchors
) &&
165 dict
->GetDouble("numForms", &numForms
) &&
166 dict
->GetString("innerText", &innerText
) &&
167 dict
->GetString("textContent", &textContent
) &&
168 dict
->GetString("innerHTML", &innerHTML
))) {
169 return std::vector
<double>();
172 GURL
parsed_url(url
);
173 if (!parsed_url
.is_valid()) {
174 return std::vector
<double>();
177 return CalculateDerivedFeatures(isOGArticle
, parsed_url
, numElements
,
178 numAnchors
, numForms
, innerText
, textContent
,
182 } // namespace dom_distiller