Blink roll 25b6bd3a7a131ffe68d809546ad1a20707915cdc:3a503f41ae42e5b79cfcd2ff10e65afde...
[chromium-blink-merge.git] / pdf / pdfium / pdfium_page.cc
blobd8a5dce5a2a2be4bb08e0a58117a8988c7ea91f1
1 // Copyright (c) 2010 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "pdf/pdfium/pdfium_page.h"
7 #include <math.h>
9 #include "base/logging.h"
10 #include "base/strings/string_number_conversions.h"
11 #include "base/strings/string_util.h"
12 #include "base/strings/utf_string_conversions.h"
13 #include "base/values.h"
14 #include "pdf/pdfium/pdfium_engine.h"
namespace {

// Used when doing hit detection: passed as both the x and the y tolerance to
// FPDFText_GetCharIndexAtPos(). Declared as a typed constant rather than a
// preprocessor macro so that it obeys normal C++ scoping and type checking.
const double kTolerance = 20.0;

// Dictionary Value key names for returning the accessible page content as JSON.
const char kPageWidth[] = "width";
const char kPageHeight[] = "height";
const char kPageTextBox[] = "textBox";
const char kTextBoxLeft[] = "left";
const char kTextBoxTop[] = "top";
const char kTextBoxWidth[] = "width";
const char kTextBoxHeight[] = "height";
const char kTextBoxFontSize[] = "fontSize";
const char kTextBoxNodes[] = "textNodes";
const char kTextNodeType[] = "type";
const char kTextNodeText[] = "text";
const char kTextNodeURL[] = "url";
const char kTextNodeTypeText[] = "text";
const char kTextNodeTypeURL[] = "url";

// Prefix of the URL generated for links that point at another page of the
// same document; the destination page index is appended to it.
const char kDocLinkURLPrefix[] = "#page";

}  // namespace
36 namespace chrome_pdf {
38 PDFiumPage::PDFiumPage(PDFiumEngine* engine,
39 int i,
40 const pp::Rect& r,
41 bool available)
42 : engine_(engine),
43 page_(NULL),
44 text_page_(NULL),
45 index_(i),
46 rect_(r),
47 calculated_links_(false),
48 available_(available) {
51 PDFiumPage::~PDFiumPage() {
54 void PDFiumPage::Unload() {
55 if (text_page_) {
56 FPDFText_ClosePage(text_page_);
57 text_page_ = NULL;
60 if (page_) {
61 if (engine_->form()) {
62 FORM_OnBeforeClosePage(page_, engine_->form());
64 FPDF_ClosePage(page_);
65 page_ = NULL;
69 FPDF_PAGE PDFiumPage::GetPage() {
70 ScopedUnsupportedFeature scoped_unsupported_feature(engine_);
71 if (!available_)
72 return NULL;
73 if (!page_) {
74 page_ = FPDF_LoadPage(engine_->doc(), index_);
75 if (page_ && engine_->form()) {
76 FORM_OnAfterLoadPage(page_, engine_->form());
79 return page_;
82 FPDF_PAGE PDFiumPage::GetPrintPage() {
83 ScopedUnsupportedFeature scoped_unsupported_feature(engine_);
84 if (!available_)
85 return NULL;
86 if (!page_)
87 page_ = FPDF_LoadPage(engine_->doc(), index_);
88 return page_;
91 void PDFiumPage::ClosePrintPage() {
92 if (page_) {
93 FPDF_ClosePage(page_);
94 page_ = NULL;
98 FPDF_TEXTPAGE PDFiumPage::GetTextPage() {
99 if (!available_)
100 return NULL;
101 if (!text_page_)
102 text_page_ = FPDFText_LoadPage(GetPage());
103 return text_page_;
106 base::Value* PDFiumPage::GetAccessibleContentAsValue(int rotation) {
107 base::DictionaryValue* node = new base::DictionaryValue();
109 if (!available_)
110 return node;
112 double width = FPDF_GetPageWidth(GetPage());
113 double height = FPDF_GetPageHeight(GetPage());
115 base::ListValue* text = new base::ListValue();
116 int box_count = FPDFText_CountRects(GetTextPage(), 0, GetCharCount());
117 for (int i = 0; i < box_count; i++) {
118 double left, top, right, bottom;
119 FPDFText_GetRect(GetTextPage(), i, &left, &top, &right, &bottom);
120 text->Append(
121 GetTextBoxAsValue(height, left, top, right, bottom, rotation));
124 node->SetDouble(kPageWidth, width);
125 node->SetDouble(kPageHeight, height);
126 node->Set(kPageTextBox, text); // Takes ownership of |text|
128 return node;
131 base::Value* PDFiumPage::GetTextBoxAsValue(double page_height,
132 double left, double top,
133 double right, double bottom,
134 int rotation) {
135 base::string16 text_utf16;
136 int char_count =
137 FPDFText_GetBoundedText(GetTextPage(), left, top, right, bottom, NULL, 0);
138 if (char_count > 0) {
139 unsigned short* data = reinterpret_cast<unsigned short*>(
140 WriteInto(&text_utf16, char_count + 1));
141 FPDFText_GetBoundedText(GetTextPage(),
142 left, top, right, bottom,
143 data, char_count);
145 std::string text_utf8 = base::UTF16ToUTF8(text_utf16);
147 FPDF_LINK link = FPDFLink_GetLinkAtPoint(GetPage(), left, top);
148 Area area;
149 std::vector<LinkTarget> targets;
150 if (link) {
151 targets.push_back(LinkTarget());
152 area = GetLinkTarget(link, &targets[0]);
153 } else {
154 pp::Rect rect(
155 PageToScreen(pp::Point(), 1.0, left, top, right, bottom, rotation));
156 GetLinks(rect, &targets);
157 area = targets.size() == 0 ? TEXT_AREA : WEBLINK_AREA;
160 int char_index = FPDFText_GetCharIndexAtPos(GetTextPage(), left, top,
161 kTolerance, kTolerance);
162 double font_size = FPDFText_GetFontSize(GetTextPage(), char_index);
164 base::DictionaryValue* node = new base::DictionaryValue();
165 node->SetDouble(kTextBoxLeft, left);
166 node->SetDouble(kTextBoxTop, page_height - top);
167 node->SetDouble(kTextBoxWidth, right - left);
168 node->SetDouble(kTextBoxHeight, top - bottom);
169 node->SetDouble(kTextBoxFontSize, font_size);
171 base::ListValue* text_nodes = new base::ListValue();
173 if (area == DOCLINK_AREA) {
174 std::string url = kDocLinkURLPrefix + base::IntToString(targets[0].page);
175 text_nodes->Append(CreateURLNode(text_utf8, url));
176 } else if (area == WEBLINK_AREA && link) {
177 text_nodes->Append(CreateURLNode(text_utf8, targets[0].url));
178 } else if (area == WEBLINK_AREA && !link) {
179 size_t start = 0;
180 for (size_t i = 0; i < targets.size(); ++i) {
181 // Remove the extra NULL character at end.
182 // Otherwise, find() will not return any matches.
183 if (targets[i].url.size() > 0 &&
184 targets[i].url[targets[i].url.size() - 1] == '\0') {
185 targets[i].url.resize(targets[i].url.size() - 1);
187 // There should only ever be one NULL character
188 DCHECK(targets[i].url[targets[i].url.size() - 1] != '\0');
190 // PDFium may change the case of generated links.
191 std::string lowerCaseURL = base::StringToLowerASCII(targets[i].url);
192 std::string lowerCaseText = base::StringToLowerASCII(text_utf8);
193 size_t pos = lowerCaseText.find(lowerCaseURL, start);
194 size_t length = targets[i].url.size();
195 if (pos == std::string::npos) {
196 // Check if the link is a "mailto:" URL
197 if (lowerCaseURL.compare(0, 7, "mailto:") == 0) {
198 pos = lowerCaseText.find(lowerCaseURL.substr(7), start);
199 length -= 7;
202 if (pos == std::string::npos) {
203 // No match has been found. This should never happen.
204 continue;
208 std::string before_text = text_utf8.substr(start, pos - start);
209 if (before_text.size() > 0)
210 text_nodes->Append(CreateTextNode(before_text));
211 std::string link_text = text_utf8.substr(pos, length);
212 text_nodes->Append(CreateURLNode(link_text, targets[i].url));
214 start = pos + length;
216 std::string before_text = text_utf8.substr(start);
217 if (before_text.size() > 0)
218 text_nodes->Append(CreateTextNode(before_text));
219 } else {
220 text_nodes->Append(CreateTextNode(text_utf8));
223 node->Set(kTextBoxNodes, text_nodes); // Takes ownership of |text_nodes|.
224 return node;
227 base::Value* PDFiumPage::CreateTextNode(std::string text) {
228 base::DictionaryValue* node = new base::DictionaryValue();
229 node->SetString(kTextNodeType, kTextNodeTypeText);
230 node->SetString(kTextNodeText, text);
231 return node;
234 base::Value* PDFiumPage::CreateURLNode(std::string text, std::string url) {
235 base::DictionaryValue* node = new base::DictionaryValue();
236 node->SetString(kTextNodeType, kTextNodeTypeURL);
237 node->SetString(kTextNodeText, text);
238 node->SetString(kTextNodeURL, url);
239 return node;
242 PDFiumPage::Area PDFiumPage::GetCharIndex(const pp::Point& point,
243 int rotation,
244 int* char_index,
245 LinkTarget* target) {
246 if (!available_)
247 return NONSELECTABLE_AREA;
248 pp::Point point2 = point - rect_.point();
249 double new_x, new_y;
250 FPDF_DeviceToPage(GetPage(), 0, 0, rect_.width(), rect_.height(),
251 rotation, point2.x(), point2.y(), &new_x, &new_y);
253 int rv = FPDFText_GetCharIndexAtPos(
254 GetTextPage(), new_x, new_y, kTolerance, kTolerance);
255 *char_index = rv;
257 FPDF_LINK link = FPDFLink_GetLinkAtPoint(GetPage(), new_x, new_y);
258 if (link) {
259 // We don't handle all possible link types of the PDF. For example,
260 // launch actions, cross-document links, etc.
261 // In that case, GetLinkTarget() will return NONSELECTABLE_AREA
262 // and we should proceed with area detection.
263 PDFiumPage::Area area = GetLinkTarget(link, target);
264 if (area != PDFiumPage::NONSELECTABLE_AREA)
265 return area;
268 if (rv < 0)
269 return NONSELECTABLE_AREA;
271 return GetLink(*char_index, target) != -1 ? WEBLINK_AREA : TEXT_AREA;
274 base::char16 PDFiumPage::GetCharAtIndex(int index) {
275 if (!available_)
276 return L'\0';
277 return static_cast<base::char16>(FPDFText_GetUnicode(GetTextPage(), index));
280 int PDFiumPage::GetCharCount() {
281 if (!available_)
282 return 0;
283 return FPDFText_CountChars(GetTextPage());
286 PDFiumPage::Area PDFiumPage::GetLinkTarget(
287 FPDF_LINK link, PDFiumPage::LinkTarget* target) {
288 FPDF_DEST dest = FPDFLink_GetDest(engine_->doc(), link);
289 if (dest != NULL)
290 return GetDestinationTarget(dest, target);
292 FPDF_ACTION action = FPDFLink_GetAction(link);
293 if (action) {
294 switch (FPDFAction_GetType(action)) {
295 case PDFACTION_GOTO: {
296 FPDF_DEST dest = FPDFAction_GetDest(engine_->doc(), action);
297 if (dest)
298 return GetDestinationTarget(dest, target);
299 // TODO(gene): We don't fully support all types of the in-document
300 // links. Need to implement that. There is a bug to track that:
301 // http://code.google.com/p/chromium/issues/detail?id=55776
302 } break;
303 case PDFACTION_URI: {
304 if (target) {
305 size_t buffer_size =
306 FPDFAction_GetURIPath(engine_->doc(), action, NULL, 0);
307 if (buffer_size > 1) {
308 void* data = WriteInto(&target->url, buffer_size + 1);
309 FPDFAction_GetURIPath(engine_->doc(), action, data, buffer_size);
312 return WEBLINK_AREA;
313 } break;
314 // TODO(gene): We don't support PDFACTION_REMOTEGOTO and PDFACTION_LAUNCH
315 // at the moment.
319 return NONSELECTABLE_AREA;
322 PDFiumPage::Area PDFiumPage::GetDestinationTarget(
323 FPDF_DEST destination, PDFiumPage::LinkTarget* target) {
324 int page_index = FPDFDest_GetPageIndex(engine_->doc(), destination);
325 if (target) {
326 target->page = page_index;
328 return DOCLINK_AREA;
331 int PDFiumPage::GetLink(int char_index, PDFiumPage::LinkTarget* target) {
332 if (!available_)
333 return -1;
335 CalculateLinks();
337 // Get the bounding box of the rect again, since it might have moved because
338 // of the tolerance above.
339 double left, right, bottom, top;
340 FPDFText_GetCharBox(GetTextPage(), char_index, &left, &right, &bottom, &top);
342 pp::Point origin(
343 PageToScreen(pp::Point(), 1.0, left, top, right, bottom, 0).point());
344 for (size_t i = 0; i < links_.size(); ++i) {
345 for (size_t j = 0; j < links_[i].rects.size(); ++j) {
346 if (links_[i].rects[j].Contains(origin)) {
347 if (target)
348 target->url = links_[i].url;
349 return i;
353 return -1;
356 std::vector<int> PDFiumPage::GetLinks(pp::Rect text_area,
357 std::vector<LinkTarget>* targets) {
358 if (!available_)
359 return std::vector<int>();
361 CalculateLinks();
363 std::vector<int> links;
365 for (size_t i = 0; i < links_.size(); ++i) {
366 for (size_t j = 0; j < links_[i].rects.size(); ++j) {
367 if (links_[i].rects[j].Intersects(text_area)) {
368 if (targets) {
369 LinkTarget target;
370 target.url = links_[i].url;
371 targets->push_back(target);
373 links.push_back(i);
377 return links;
380 void PDFiumPage::CalculateLinks() {
381 if (calculated_links_)
382 return;
384 calculated_links_ = true;
385 FPDF_PAGELINK links = FPDFLink_LoadWebLinks(GetTextPage());
386 int count = FPDFLink_CountWebLinks(links);
387 for (int i = 0; i < count; ++i) {
388 base::string16 url;
389 int url_length = FPDFLink_GetURL(links, i, NULL, 0);
390 if (url_length > 1) { // WriteInto needs at least 2 characters.
391 unsigned short* data =
392 reinterpret_cast<unsigned short*>(WriteInto(&url, url_length + 1));
393 FPDFLink_GetURL(links, i, data, url_length);
395 Link link;
396 link.url = base::UTF16ToUTF8(url);
398 // If the link cannot be converted to a pp::Var, then it is not possible to
399 // pass it to JS. In this case, ignore the link like other PDF viewers.
400 // See http://crbug.com/312882 for an example.
401 pp::Var link_var(link.url);
402 if (!link_var.is_string())
403 continue;
405 // Make sure all the characters in the URL are valid per RFC 1738.
406 // http://crbug.com/340326 has a sample bad PDF.
407 // GURL does not work correctly, e.g. it just strips \t \r \n.
408 bool is_invalid_url = false;
409 for (size_t j = 0; j < link.url.length(); ++j) {
410 // Control characters are not allowed.
411 // 0x7F is also a control character.
412 // 0x80 and above are not in US-ASCII.
413 if (link.url[j] < ' ' || link.url[j] >= '\x7F') {
414 is_invalid_url = true;
415 break;
418 if (is_invalid_url)
419 continue;
421 int rect_count = FPDFLink_CountRects(links, i);
422 for (int j = 0; j < rect_count; ++j) {
423 double left, top, right, bottom;
424 FPDFLink_GetRect(links, i, j, &left, &top, &right, &bottom);
425 link.rects.push_back(
426 PageToScreen(pp::Point(), 1.0, left, top, right, bottom, 0));
428 links_.push_back(link);
430 FPDFLink_CloseWebLinks(links);
433 pp::Rect PDFiumPage::PageToScreen(const pp::Point& offset,
434 double zoom,
435 double left,
436 double top,
437 double right,
438 double bottom,
439 int rotation) {
440 if (!available_)
441 return pp::Rect();
443 int new_left, new_top, new_right, new_bottom;
444 FPDF_PageToDevice(
445 page_,
446 static_cast<int>((rect_.x() - offset.x()) * zoom),
447 static_cast<int>((rect_.y() - offset.y()) * zoom),
448 static_cast<int>(ceil(rect_.width() * zoom)),
449 static_cast<int>(ceil(rect_.height() * zoom)),
450 rotation, left, top, &new_left, &new_top);
451 FPDF_PageToDevice(
452 page_,
453 static_cast<int>((rect_.x() - offset.x()) * zoom),
454 static_cast<int>((rect_.y() - offset.y()) * zoom),
455 static_cast<int>(ceil(rect_.width() * zoom)),
456 static_cast<int>(ceil(rect_.height() * zoom)),
457 rotation, right, bottom, &new_right, &new_bottom);
459 // If the PDF is rotated, the horizontal/vertical coordinates could be
460 // flipped. See
461 // http://www.netl.doe.gov/publications/proceedings/03/ubc/presentations/Goeckner-pres.pdf
462 if (new_right < new_left)
463 std::swap(new_right, new_left);
464 if (new_bottom < new_top)
465 std::swap(new_bottom, new_top);
467 return pp::Rect(
468 new_left, new_top, new_right - new_left + 1, new_bottom - new_top + 1);
471 PDFiumPage::Link::Link() {
474 PDFiumPage::Link::~Link() {
477 } // namespace chrome_pdf