1 // Copyright (c) 2010 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "pdf/pdfium/pdfium_page.h"
9 #include "base/logging.h"
10 #include "base/strings/string_number_conversions.h"
11 #include "base/strings/string_util.h"
12 #include "base/strings/utf_string_conversions.h"
13 #include "base/values.h"
14 #include "pdf/pdfium/pdfium_api_string_buffer_adapter.h"
15 #include "pdf/pdfium/pdfium_engine.h"
// kTolerance is passed as both tolerance arguments of
// FPDFText_GetCharIndexAtPos() wherever this file maps a page position to a
// character index.
17 // Used when doing hit detection.
18 #define kTolerance 20.0
22 // Dictionary Value key names for returning the accessible page content as JSON.
// Keys set on the page-level dictionary (see GetAccessibleContentAsValue()).
23 const char kPageWidth
[] = "width";
24 const char kPageHeight
[] = "height";
// Key under which the list of text boxes is stored on the page dictionary.
25 const char kPageTextBox
[] = "textBox";
// Keys set on each text-box dictionary (see GetTextBoxAsValue()).
26 const char kTextBoxLeft
[] = "left";
27 const char kTextBoxTop
[] = "top";
28 const char kTextBoxWidth
[] = "width";
29 const char kTextBoxHeight
[] = "height";
30 const char kTextBoxFontSize
[] = "fontSize";
// Key under which a text box stores its list of text/URL nodes.
31 const char kTextBoxNodes
[] = "textNodes";
// Keys set on each node dictionary (see CreateTextNode()/CreateURLNode()).
32 const char kTextNodeType
[] = "type";
33 const char kTextNodeText
[] = "text";
34 const char kTextNodeURL
[] = "url";
// Values stored under kTextNodeType.
35 const char kTextNodeTypeText
[] = "text";
36 const char kTextNodeTypeURL
[] = "url";
// Prefix prepended to the decimal target page number to form the URL of an
// in-document link.
37 const char kDocLinkURLPrefix
[] = "#page";
41 namespace chrome_pdf
{
// Constructs a page bound to |engine|. Of the member initializers, only
// calculated_links_ (starts false, so web links are computed lazily by
// CalculateLinks()) and available_ (copied from |available|) are visible.
// NOTE(review): the remaining parameters, initializers and the constructor
// body are not present in this listing.
43 PDFiumPage::PDFiumPage(PDFiumEngine
* engine
,
53 calculated_links_(false),
54 available_(available
) {
// Destructor. Asserts via DCHECK_EQ that loading_count_ is 0, i.e. that no
// ScopedLoadCounter for this page is still alive.
57 PDFiumPage::~PDFiumPage() {
58 DCHECK_EQ(0, loading_count_
);
// Releases the PDFium handles held by this page, in this order: closes
// text_page_ with FPDFText_ClosePage(), notifies the form environment with
// FORM_OnBeforeClosePage() when engine_->form() is non-null, then closes page_
// with FPDF_ClosePage().
// NOTE(review): the guards around these calls (load-in-progress check, null
// checks, resetting the handles) are not visible in this listing.
61 void PDFiumPage::Unload() {
62 // Do not unload while in the middle of a load.
67 FPDFText_ClosePage(text_page_
);
72 if (engine_
->form()) {
73 FORM_OnBeforeClosePage(page_
, engine_
->form());
75 FPDF_ClosePage(page_
);
// Loads page_ with FPDF_LoadPage(engine_->doc(), index_) while a
// ScopedLoadCounter is held, and calls FORM_OnAfterLoadPage() when both the
// loaded page and engine_->form() are non-null. A ScopedUnsupportedFeature is
// constructed for engine_ for the duration of the call.
// NOTE(review): the condition under which the load is performed and the return
// statement are not visible; presumably returns page_ — confirm.
80 FPDF_PAGE
PDFiumPage::GetPage() {
81 ScopedUnsupportedFeature
scoped_unsupported_feature(engine_
);
85 ScopedLoadCounter
scoped_load(this);
86 page_
= FPDF_LoadPage(engine_
->doc(), index_
);
87 if (page_
&& engine_
->form()) {
88 FORM_OnAfterLoadPage(page_
, engine_
->form());
// Loads page_ with FPDF_LoadPage(engine_->doc(), index_) while a
// ScopedLoadCounter is held. Unlike GetPage(), no FORM_OnAfterLoadPage() call
// is visible here.
// NOTE(review): the guard around the load and the return statement are not
// visible; presumably returns page_ — confirm.
94 FPDF_PAGE
PDFiumPage::GetPrintPage() {
95 ScopedUnsupportedFeature
scoped_unsupported_feature(engine_
);
99 ScopedLoadCounter
scoped_load(this);
100 page_
= FPDF_LoadPage(engine_
->doc(), index_
);
// Closes page_ with FPDF_ClosePage(). No FORM_OnBeforeClosePage() call is
// visible here, in contrast to Unload().
// NOTE(review): the load-in-progress guard referred to by the comment below is
// not visible in this listing.
105 void PDFiumPage::ClosePrintPage() {
106 // Do not close |page_| while in the middle of a load.
111 FPDF_ClosePage(page_
);
// Loads text_page_ with FPDFText_LoadPage(GetPage()) while a ScopedLoadCounter
// is held.
// NOTE(review): the guard around the load and the return statement are not
// visible; presumably returns text_page_ — confirm.
116 FPDF_TEXTPAGE
PDFiumPage::GetTextPage() {
120 ScopedLoadCounter
scoped_load(this);
121 text_page_
= FPDFText_LoadPage(GetPage());
// Builds a heap-allocated DictionaryValue describing the page: its width and
// height (from FPDF_GetPageWidth/FPDF_GetPageHeight) and, under kPageTextBox,
// a list built from one GetTextBoxAsValue() call per text rect.
// |rotation| is forwarded unchanged to GetTextBoxAsValue().
// NOTE(review): the statement that appends each text box to |text| and the
// return statement are only partly visible; presumably returns |node| — confirm.
126 base::Value
* PDFiumPage::GetAccessibleContentAsValue(int rotation
) {
127 base::DictionaryValue
* node
= new base::DictionaryValue();
132 double width
= FPDF_GetPageWidth(GetPage());
133 double height
= FPDF_GetPageHeight(GetPage());
135 base::ListValue
* text
= new base::ListValue();
// Count the text rects covering characters [0, GetCharCount()).
136 int box_count
= FPDFText_CountRects(GetTextPage(), 0, GetCharCount());
137 for (int i
= 0; i
< box_count
; i
++) {
138 double left
, top
, right
, bottom
;
139 FPDFText_GetRect(GetTextPage(), i
, &left
, &top
, &right
, &bottom
);
141 GetTextBoxAsValue(height
, left
, top
, right
, bottom
, rotation
));
144 node
->SetDouble(kPageWidth
, width
);
145 node
->SetDouble(kPageHeight
, height
);
146 node
->Set(kPageTextBox
, text
); // Takes ownership of |text|
// Builds a heap-allocated DictionaryValue for one text box bounded by
// (left, top, right, bottom):
//   - geometry: left, top (stored as page_height - top), width (right - left),
//     height (top - bottom) and the font size of the character found near
//     (left, top);
//   - kTextBoxNodes: a list of text/URL nodes derived from the box's text and
//     from any link that applies to it.
// The link handling has three visible cases: a DOCLINK_AREA yields one URL node
// whose URL is kDocLinkURLPrefix plus the target page; a WEBLINK_AREA found via
// FPDFLink_GetLinkAtPoint() yields one URL node with targets[0].url; a
// WEBLINK_AREA found via GetLinks() splits the text around each matching URL.
// NOTE(review): several statements (the rest of the signature, the declaration
// of char_count/area/rect/start, some branch headers and the return) are not
// visible in this listing; presumably returns |node| — confirm.
151 base::Value
* PDFiumPage::GetTextBoxAsValue(double page_height
,
152 double left
, double top
,
153 double right
, double bottom
,
155 base::string16 text_utf16
;
// Sizing call: NULL buffer and length 0. NOTE(review): the assignment of its
// result (presumably to char_count) is on a line not shown.
157 FPDFText_GetBoundedText(GetTextPage(), left
, top
, right
, bottom
, NULL
, 0);
158 if (char_count
> 0) {
// Reserve char_count + 1 UTF-16 units in text_utf16 and fetch the text into it.
159 unsigned short* data
= reinterpret_cast<unsigned short*>(
160 base::WriteInto(&text_utf16
, char_count
+ 1));
161 FPDFText_GetBoundedText(GetTextPage(),
162 left
, top
, right
, bottom
,
165 std::string text_utf8
= base::UTF16ToUTF8(text_utf16
);
// Look up the link, if any, at the (left, top) corner of the box.
167 FPDF_LINK link
= FPDFLink_GetLinkAtPoint(GetPage(), left
, top
);
169 std::vector
<LinkTarget
> targets
;
// With a link at that point, targets[0] receives the result of GetLinkTarget().
171 targets
.push_back(LinkTarget());
172 area
= GetLinkTarget(link
, &targets
[0]);
// Alternative path (presumably the else branch — confirm): collect the web
// links returned by GetLinks() for the box's screen rect.
175 PageToScreen(pp::Point(), 1.0, left
, top
, right
, bottom
, rotation
));
176 GetLinks(rect
, &targets
);
177 area
= targets
.size() == 0 ? TEXT_AREA
: WEBLINK_AREA
;
// Font size is taken from the character found near (left, top).
180 int char_index
= FPDFText_GetCharIndexAtPos(GetTextPage(), left
, top
,
181 kTolerance
, kTolerance
);
182 double font_size
= FPDFText_GetFontSize(GetTextPage(), char_index
);
184 base::DictionaryValue
* node
= new base::DictionaryValue();
185 node
->SetDouble(kTextBoxLeft
, left
);
// The vertical position is stored as page_height - top.
186 node
->SetDouble(kTextBoxTop
, page_height
- top
);
187 node
->SetDouble(kTextBoxWidth
, right
- left
);
188 node
->SetDouble(kTextBoxHeight
, top
- bottom
);
189 node
->SetDouble(kTextBoxFontSize
, font_size
);
191 base::ListValue
* text_nodes
= new base::ListValue();
193 if (area
== DOCLINK_AREA
) {
194 std::string url
= kDocLinkURLPrefix
+ base::IntToString(targets
[0].page
);
195 text_nodes
->Append(CreateURLNode(text_utf8
, url
));
196 } else if (area
== WEBLINK_AREA
&& link
) {
197 text_nodes
->Append(CreateURLNode(text_utf8
, targets
[0].url
));
198 } else if (area
== WEBLINK_AREA
&& !link
) {
// Walk the collected links in order, splitting text_utf8 into plain-text and
// URL nodes. |start| is the offset in text_utf8 where the next search begins.
200 for (size_t i
= 0; i
< targets
.size(); ++i
) {
201 // If there is an extra NULL character at end, find() will not return any
202 // matches. There should not be any though.
203 if (!targets
[i
].url
.empty())
204 DCHECK(targets
[i
].url
[targets
[i
].url
.size() - 1] != '\0');
206 // PDFium may change the case of generated links.
207 std::string lowerCaseURL
= base::ToLowerASCII(targets
[i
].url
);
208 std::string lowerCaseText
= base::ToLowerASCII(text_utf8
);
209 size_t pos
= lowerCaseText
.find(lowerCaseURL
, start
);
210 size_t length
= targets
[i
].url
.size();
211 if (pos
== std::string::npos
) {
212 // Check if the link is a "mailto:" URL
213 if (lowerCaseURL
.compare(0, 7, "mailto:") == 0) {
// Retry the search with the 7-character "mailto:" prefix removed.
214 pos
= lowerCaseText
.find(lowerCaseURL
.substr(7), start
);
218 if (pos
== std::string::npos
) {
219 // No match has been found. This should never happen.
// Text between the previous match and this one becomes a plain text node,
// followed by a URL node for the matched span.
224 std::string before_text
= text_utf8
.substr(start
, pos
- start
);
225 if (before_text
.size() > 0)
226 text_nodes
->Append(CreateTextNode(before_text
));
227 std::string link_text
= text_utf8
.substr(pos
, length
);
228 text_nodes
->Append(CreateURLNode(link_text
, targets
[i
].url
));
230 start
= pos
+ length
;
// Whatever follows the last matched link becomes a trailing text node.
232 std::string before_text
= text_utf8
.substr(start
);
233 if (before_text
.size() > 0)
234 text_nodes
->Append(CreateTextNode(before_text
));
// Remaining case (presumably the final else branch — confirm): the whole text
// is emitted as a single text node.
236 text_nodes
->Append(CreateTextNode(text_utf8
));
239 node
->Set(kTextBoxNodes
, text_nodes
); // Takes ownership of |text_nodes|.
// Creates a heap-allocated DictionaryValue for a plain text node: its
// kTextNodeType is kTextNodeTypeText and its kTextNodeText is |text|.
// NOTE(review): the return statement is not visible; presumably returns
// |node| — confirm.
243 base::Value
* PDFiumPage::CreateTextNode(std::string text
) {
244 base::DictionaryValue
* node
= new base::DictionaryValue();
245 node
->SetString(kTextNodeType
, kTextNodeTypeText
);
246 node
->SetString(kTextNodeText
, text
);
// Creates a heap-allocated DictionaryValue for a link node: its kTextNodeType
// is kTextNodeTypeURL, its kTextNodeText is |text| (the visible text) and its
// kTextNodeURL is |url| (the link target).
// NOTE(review): the return statement is not visible; presumably returns
// |node| — confirm.
250 base::Value
* PDFiumPage::CreateURLNode(std::string text
, std::string url
) {
251 base::DictionaryValue
* node
= new base::DictionaryValue();
252 node
->SetString(kTextNodeType
, kTextNodeTypeURL
);
253 node
->SetString(kTextNodeText
, text
);
254 node
->SetString(kTextNodeURL
, url
);
// Hit-tests |point| against this page and classifies what lies under it.
// The point is made relative to rect_, converted to page coordinates with
// FPDF_DeviceToPage(), and then checked for a character (within kTolerance),
// a link and a form field. When a link and a form field overlap, their
// z-orders decide which wins; a winning form field is reported through
// |form_type| with a NONSELECTABLE_AREA result. Link targets are written to
// |target| by GetLinkTarget()/GetLink(). The last visible return yields
// WEBLINK_AREA when GetLink() finds a web link for *char_index, else TEXT_AREA.
// NOTE(review): part of the signature (rotation, char_index, form_type), the
// conditions guarding the early returns, and several statements are not
// visible in this listing.
258 PDFiumPage::Area
PDFiumPage::GetCharIndex(const pp::Point
& point
,
262 LinkTarget
* target
) {
264 return NONSELECTABLE_AREA
;
// Make the point relative to this page's rect_.
265 pp::Point point2
= point
- rect_
.point();
// Convert the device-space point to page coordinates (new_x, new_y).
268 FPDF_DeviceToPage(GetPage(), 0, 0, rect_
.width(), rect_
.height(),
269 rotation
, point2
.x(), point2
.y(), &new_x
, &new_y
);
271 int rv
= FPDFText_GetCharIndexAtPos(
272 GetTextPage(), new_x
, new_y
, kTolerance
, kTolerance
);
275 FPDF_LINK link
= FPDFLink_GetLinkAtPoint(GetPage(), new_x
, new_y
);
// NOTE(review): the result of this call is presumably stored in |control| on a
// line not shown.
277 FPDPage_HasFormFieldAtPoint(engine_
->form(), GetPage(), new_x
, new_y
);
279 // If there is a control and link at the same point, figure out their z-order
280 // to determine which is on top.
281 if (link
&& control
> FPDF_FORMFIELD_UNKNOWN
) {
282 int control_z_order
= FPDFPage_FormFieldZOrderAtPoint(
283 engine_
->form(), GetPage(), new_x
, new_y
);
284 int link_z_order
= FPDFLink_GetLinkZOrderAtPoint(GetPage(), new_x
, new_y
);
285 DCHECK_NE(control_z_order
, link_z_order
);
286 if (control_z_order
> link_z_order
) {
287 *form_type
= control
;
288 return PDFiumPage::NONSELECTABLE_AREA
;
291 // We don't handle all possible link types of the PDF. For example,
292 // launch actions, cross-document links, etc.
293 // In that case, GetLinkTarget() will return NONSELECTABLE_AREA
294 // and we should proceed with area detection.
295 PDFiumPage::Area area
= GetLinkTarget(link
, target
);
296 if (area
!= PDFiumPage::NONSELECTABLE_AREA
)
299 // We don't handle all possible link types of the PDF. For example,
300 // launch actions, cross-document links, etc.
301 // See identical block above.
302 PDFiumPage::Area area
= GetLinkTarget(link
, target
);
303 if (area
!= PDFiumPage::NONSELECTABLE_AREA
)
305 } else if (control
> FPDF_FORMFIELD_UNKNOWN
) {
306 *form_type
= control
;
307 return PDFiumPage::NONSELECTABLE_AREA
;
311 return NONSELECTABLE_AREA
;
// GetLink() returning -1 means no web link applies to this character.
313 return GetLink(*char_index
, target
) != -1 ? WEBLINK_AREA
: TEXT_AREA
;
// Returns the value FPDFText_GetUnicode() reports for the character at |index|
// of this page's text, cast to base::char16.
// NOTE(review): any guard preceding the return is not visible in this listing.
316 base::char16
PDFiumPage::GetCharAtIndex(int index
) {
319 return static_cast<base::char16
>(FPDFText_GetUnicode(GetTextPage(), index
));
// Returns the number of characters on this page's text page, as reported by
// FPDFText_CountChars().
// NOTE(review): any guard preceding the return is not visible in this listing.
322 int PDFiumPage::GetCharCount() {
325 return FPDFText_CountChars(GetTextPage());
// Resolves |link| into |target| and reports what kind of area it represents.
// A link with a destination is handled by GetDestinationTarget(). Otherwise
// the link's action is inspected: PDFACTION_GOTO resolves the action's
// destination the same way, and PDFACTION_URI copies the URI path into
// target->url. The last visible statement returns NONSELECTABLE_AREA, which
// callers treat as "link type not handled".
// NOTE(review): the null checks on dest/action, the return value of the URI
// case and the remaining switch cases are not visible in this listing.
328 PDFiumPage::Area
PDFiumPage::GetLinkTarget(
329 FPDF_LINK link
, PDFiumPage::LinkTarget
* target
) {
330 FPDF_DEST dest
= FPDFLink_GetDest(engine_
->doc(), link
);
332 return GetDestinationTarget(dest
, target
);
334 FPDF_ACTION action
= FPDFLink_GetAction(link
);
336 switch (FPDFAction_GetType(action
)) {
337 case PDFACTION_GOTO
: {
338 FPDF_DEST dest
= FPDFAction_GetDest(engine_
->doc(), action
);
340 return GetDestinationTarget(dest
, target
);
341 // TODO(gene): We don't fully support all types of the in-document
342 // links. Need to implement that. There is a bug to track that:
343 // http://code.google.com/p/chromium/issues/detail?id=55776
345 case PDFACTION_URI
: {
// Sizing call: NULL buffer and length 0. NOTE(review): the assignment of its
// result (presumably to buffer_size) is on a line not shown.
348 FPDFAction_GetURIPath(engine_
->doc(), action
, NULL
, 0);
349 if (buffer_size
> 0) {
// Second call writes the URI into target->url through the string adapter.
350 PDFiumAPIStringBufferAdapter
<std::string
> api_string_adapter(
351 &target
->url
, buffer_size
, true);
352 void* data
= api_string_adapter
.GetData();
353 size_t bytes_written
= FPDFAction_GetURIPath(
354 engine_
->doc(), action
, data
, buffer_size
);
355 api_string_adapter
.Close(bytes_written
);
360 // TODO(gene): We don't support PDFACTION_REMOTEGOTO and PDFACTION_LAUNCH
365 return NONSELECTABLE_AREA
;
// Looks up the page index of |destination| with FPDFDest_GetPageIndex() and
// stores it in target->page.
// NOTE(review): any validation of page_index, the null check on |target| and
// the return statements are not visible in this listing.
368 PDFiumPage::Area
PDFiumPage::GetDestinationTarget(
369 FPDF_DEST destination
, PDFiumPage::LinkTarget
* target
) {
370 int page_index
= FPDFDest_GetPageIndex(engine_
->doc(), destination
);
372 target
->page
= page_index
;
// Finds the web link in links_ that covers the character at |char_index|.
// The character's box is fetched with FPDFText_GetCharBox(), converted with
// PageToScreen() (zero offset, zoom 1.0, rotation 0), and the resulting rect's
// origin point is tested against every rect of every entry in links_. On a
// hit, the link's URL is copied into target->url.
// NOTE(review): the return statements are not visible. GetCharIndex() compares
// the result with -1, so presumably this returns the index of the matching
// link, or -1 when none matches — confirm.
377 int PDFiumPage::GetLink(int char_index
, PDFiumPage::LinkTarget
* target
) {
383 // Get the bounding box of the rect again, since it might have moved because
384 // of the tolerance above.
385 double left
, right
, bottom
, top
;
386 FPDFText_GetCharBox(GetTextPage(), char_index
, &left
, &right
, &bottom
, &top
);
389 PageToScreen(pp::Point(), 1.0, left
, top
, right
, bottom
, 0).point());
390 for (size_t i
= 0; i
< links_
.size(); ++i
) {
391 for (size_t j
= 0; j
< links_
[i
].rects
.size(); ++j
) {
392 if (links_
[i
].rects
[j
].Contains(origin
)) {
394 target
->url
= links_
[i
].url
;
// Collects the web links in links_ that have a rect intersecting |text_area|.
// For each such rect, a LinkTarget carrying the link's URL is appended to
// |targets|.
// NOTE(review): the condition for the early empty-vector return, what is
// pushed into the local |links| vector, and the final return are not visible;
// presumably |links| holds the indices of the matching links — confirm.
402 std::vector
<int> PDFiumPage::GetLinks(pp::Rect text_area
,
403 std::vector
<LinkTarget
>* targets
) {
405 return std::vector
<int>();
409 std::vector
<int> links
;
411 for (size_t i
= 0; i
< links_
.size(); ++i
) {
412 for (size_t j
= 0; j
< links_
[i
].rects
.size(); ++j
) {
413 if (links_
[i
].rects
[j
].Intersects(text_area
)) {
416 target
.url
= links_
[i
].url
;
417 targets
->push_back(target
);
// Populates links_ with the web links PDFium detects in this page's text.
// Guarded by calculated_links_, which is set to true before the scan. For each
// web link, the URL is read as UTF-16 and converted to UTF-8, the link is
// checked for convertibility to a pp::Var string and for disallowed
// characters, and its rects are converted with PageToScreen() (zero offset,
// zoom 1.0, rotation 0) before the link is appended to links_.
// NOTE(review): the early return for an already-calculated page, the
// declarations of |url| and |link|, and the statements that skip rejected
// links are not visible in this listing.
426 void PDFiumPage::CalculateLinks() {
427 if (calculated_links_
)
430 calculated_links_
= true;
431 FPDF_PAGELINK links
= FPDFLink_LoadWebLinks(GetTextPage());
432 int count
= FPDFLink_CountWebLinks(links
);
433 for (int i
= 0; i
< count
; ++i
) {
// Sizing call: NULL buffer and length 0 yields the URL length.
435 int url_length
= FPDFLink_GetURL(links
, i
, NULL
, 0);
436 if (url_length
> 0) {
437 PDFiumAPIStringBufferAdapter
<base::string16
> api_string_adapter(
438 &url
, url_length
, true);
439 unsigned short* data
=
440 reinterpret_cast<unsigned short*>(api_string_adapter
.GetData());
441 int actual_length
= FPDFLink_GetURL(links
, i
, data
, url_length
);
442 api_string_adapter
.Close(actual_length
);
445 link
.url
= base::UTF16ToUTF8(url
);
447 // If the link cannot be converted to a pp::Var, then it is not possible to
448 // pass it to JS. In this case, ignore the link like other PDF viewers.
449 // See http://crbug.com/312882 for an example.
450 pp::Var
link_var(link
.url
);
451 if (!link_var
.is_string())
454 // Make sure all the characters in the URL are valid per RFC 1738.
455 // http://crbug.com/340326 has a sample bad PDF.
456 // GURL does not work correctly, e.g. it just strips \t \r \n.
457 bool is_invalid_url
= false;
458 for (size_t j
= 0; j
< link
.url
.length(); ++j
) {
459 // Control characters are not allowed.
460 // 0x7F is also a control character.
461 // 0x80 and above are not in US-ASCII.
462 if (link
.url
[j
] < ' ' || link
.url
[j
] >= '\x7F') {
463 is_invalid_url
= true;
// Record one screen-space rect per rect PDFium reports for this link.
470 int rect_count
= FPDFLink_CountRects(links
, i
);
471 for (int j
= 0; j
< rect_count
; ++j
) {
472 double left
, top
, right
, bottom
;
473 FPDFLink_GetRect(links
, i
, j
, &left
, &top
, &right
, &bottom
);
474 link
.rects
.push_back(
475 PageToScreen(pp::Point(), 1.0, left
, top
, right
, bottom
, 0));
477 links_
.push_back(link
);
479 FPDFLink_CloseWebLinks(links
);
// Converts the page-space box (left, top, right, bottom) into an integer
// rect. Both corners are converted using the same viewport arguments:
// origin (rect_.x() - offset.x(), rect_.y() - offset.y()) scaled by |zoom| and
// truncated to int, size rect_.width()/height() scaled by |zoom| and rounded
// up with ceil(), and |rotation|. The converted corners are swapped where
// needed so that left <= right and top <= bottom. The final visible
// expression uses a width of new_right - new_left + 1 and a height of
// new_bottom - new_top + 1.
// NOTE(review): the rest of the signature, the names of the two conversion
// calls and the start of the return expression are not visible in this
// listing.
482 pp::Rect
PDFiumPage::PageToScreen(const pp::Point
& offset
,
492 int new_left
, new_top
, new_right
, new_bottom
;
// Arguments for converting the (left, top) corner.
495 static_cast<int>((rect_
.x() - offset
.x()) * zoom
),
496 static_cast<int>((rect_
.y() - offset
.y()) * zoom
),
497 static_cast<int>(ceil(rect_
.width() * zoom
)),
498 static_cast<int>(ceil(rect_
.height() * zoom
)),
499 rotation
, left
, top
, &new_left
, &new_top
);
// Arguments for converting the (right, bottom) corner.
502 static_cast<int>((rect_
.x() - offset
.x()) * zoom
),
503 static_cast<int>((rect_
.y() - offset
.y()) * zoom
),
504 static_cast<int>(ceil(rect_
.width() * zoom
)),
505 static_cast<int>(ceil(rect_
.height() * zoom
)),
506 rotation
, right
, bottom
, &new_right
, &new_bottom
);
508 // If the PDF is rotated, the horizontal/vertical coordinates could be
510 // http://www.netl.doe.gov/publications/proceedings/03/ubc/presentations/Goeckner-pres.pdf
511 if (new_right
< new_left
)
512 std::swap(new_right
, new_left
);
513 if (new_bottom
< new_top
)
514 std::swap(new_bottom
, new_top
);
517 new_left
, new_top
, new_right
- new_left
+ 1, new_bottom
- new_top
+ 1);
// Increments page_->loading_count_ for the lifetime of this object; the
// destructor performs the matching decrement.
// NOTE(review): the member initializer (presumably page_(page)) is not visible
// in this listing.
520 PDFiumPage::ScopedLoadCounter::ScopedLoadCounter(PDFiumPage
* page
)
522 page_
->loading_count_
++;
// Decrements page_->loading_count_, undoing the increment made by the
// constructor.
525 PDFiumPage::ScopedLoadCounter::~ScopedLoadCounter() {
526 page_
->loading_count_
--;
// Out-of-line constructor and destructor for PDFiumPage::Link. No statements
// are visible in either body in this listing.
529 PDFiumPage::Link::Link() {
532 PDFiumPage::Link::~Link() {
535 } // namespace chrome_pdf