1 // Copyright (c) 2010 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "pdf/pdfium/pdfium_page.h"
9 #include "base/logging.h"
10 #include "base/strings/string_number_conversions.h"
11 #include "base/strings/string_util.h"
12 #include "base/strings/utf_string_conversions.h"
13 #include "base/values.h"
14 #include "pdf/pdfium/pdfium_api_string_buffer_adapter.h"
15 #include "pdf/pdfium/pdfium_engine.h"
// Tolerance used when doing hit detection. It is passed to
// FPDFText_GetCharIndexAtPos() as both the x and the y tolerance.
// Declared as a typed constant rather than a preprocessor macro so that the
// name is type-checked and obeys normal C++ scoping rules.
const double kTolerance = 20.0;
// Key names of the dictionary values produced when the accessible page
// content is returned as JSON.

// Page-level keys.
const char kPageWidth[] = "width";
const char kPageHeight[] = "height";
const char kPageTextBox[] = "textBox";

// Keys of a single text box entry.
const char kTextBoxLeft[] = "left";
const char kTextBoxTop[] = "top";
const char kTextBoxWidth[] = "width";
const char kTextBoxHeight[] = "height";
const char kTextBoxFontSize[] = "fontSize";
const char kTextBoxNodes[] = "textNodes";

// Keys of a text node stored inside a text box.
const char kTextNodeType[] = "type";
const char kTextNodeText[] = "text";
const char kTextNodeURL[] = "url";

// Values stored under kTextNodeType.
const char kTextNodeTypeText[] = "text";
const char kTextNodeTypeURL[] = "url";

// Prefix of the URL generated for in-document links; the target page index
// is appended to it.
const char kDocLinkURLPrefix[] = "#page";
41 namespace chrome_pdf
{
// Constructor. Takes a PDFiumEngine* as its first parameter. The visible part
// of the initializer list sets calculated_links_ to false and copies
// |available| into available_.
// NOTE(review): the remaining parameters and member initializers are not
// visible in this excerpt.
43 PDFiumPage::PDFiumPage(PDFiumEngine
* engine
,
53 calculated_links_(false),
54 available_(available
) {
// Destructor. Debug-checks that loading_count_ is zero, i.e. that no
// ScopedLoadCounter for this page is still alive.
57 PDFiumPage::~PDFiumPage() {
58 DCHECK_EQ(0, loading_count_
);
// Releases the PDFium handles held by this page: closes text_page_ with
// FPDFText_ClosePage() and page_ with FPDF_ClosePage(). When engine_->form()
// is non-null, FORM_OnBeforeClosePage() is called before the page is closed.
// NOTE(review): the guard conditions around these calls (including the
// "in the middle of a load" check mentioned below) are not visible in this
// excerpt.
61 void PDFiumPage::Unload() {
62 // Do not unload while in the middle of a load.
67 FPDFText_ClosePage(text_page_
);
72 if (engine_
->form()) {
73 FORM_OnBeforeClosePage(page_
, engine_
->form());
75 FPDF_ClosePage(page_
);
// Loads page index_ of engine_->doc() into page_ via FPDF_LoadPage(). A
// ScopedUnsupportedFeature and a ScopedLoadCounter are alive for the duration
// of the load. If the load produced a page and engine_->form() is non-null,
// FORM_OnAfterLoadPage() is called for it.
// NOTE(review): any early-return conditions and the return statement are not
// visible in this excerpt; presumably page_ is returned.
80 FPDF_PAGE
PDFiumPage::GetPage() {
81 ScopedUnsupportedFeature
scoped_unsupported_feature(engine_
);
85 ScopedLoadCounter
scoped_load(this);
86 page_
= FPDF_LoadPage(engine_
->doc(), index_
);
87 if (page_
&& engine_
->form()) {
88 FORM_OnAfterLoadPage(page_
, engine_
->form());
// Loads page index_ of engine_->doc() into page_ via FPDF_LoadPage() while a
// ScopedUnsupportedFeature and a ScopedLoadCounter are alive. Unlike
// GetPage(), no FORM_OnAfterLoadPage() call is visible here.
// NOTE(review): guard conditions and the return statement are not visible in
// this excerpt.
94 FPDF_PAGE
PDFiumPage::GetPrintPage() {
95 ScopedUnsupportedFeature
scoped_unsupported_feature(engine_
);
99 ScopedLoadCounter
scoped_load(this);
100 page_
= FPDF_LoadPage(engine_
->doc(), index_
);
// Closes page_ with FPDF_ClosePage().
// NOTE(review): the load-in-progress guard described by the comment below is
// not visible in this excerpt.
105 void PDFiumPage::ClosePrintPage() {
106 // Do not close |page_| while in the middle of a load.
111 FPDF_ClosePage(page_
);
// Loads the text page for this page: text_page_ is assigned the result of
// FPDFText_LoadPage(GetPage()) while a ScopedLoadCounter is alive.
// NOTE(review): guard conditions and the return statement are not visible in
// this excerpt; presumably text_page_ is returned.
116 FPDF_TEXTPAGE
PDFiumPage::GetTextPage() {
120 ScopedLoadCounter
scoped_load(this);
121 text_page_
= FPDFText_LoadPage(GetPage());
// Builds a heap-allocated base::DictionaryValue describing this page:
//   kPageWidth / kPageHeight - values of FPDF_GetPageWidth() and
//                              FPDF_GetPageHeight() for GetPage().
//   kPageTextBox             - a base::ListValue |text|.
// GetTextBoxAsValue() is invoked once for every rect reported by
// FPDFText_CountRects() / FPDFText_GetRect(), forwarding |rotation|.
// NOTE(review): the statement that adds each text box to |text| and the
// return statement are not visible in this excerpt.
126 base::Value
* PDFiumPage::GetAccessibleContentAsValue(int rotation
) {
127 base::DictionaryValue
* node
= new base::DictionaryValue();
132 double width
= FPDF_GetPageWidth(GetPage());
133 double height
= FPDF_GetPageHeight(GetPage());
135 base::ListValue
* text
= new base::ListValue();
// Count the text rects covering all characters of the page.
136 int box_count
= FPDFText_CountRects(GetTextPage(), 0, GetCharCount());
137 for (int i
= 0; i
< box_count
; i
++) {
138 double left
, top
, right
, bottom
;
139 FPDFText_GetRect(GetTextPage(), i
, &left
, &top
, &right
, &bottom
);
141 GetTextBoxAsValue(height
, left
, top
, right
, bottom
, rotation
));
144 node
->SetDouble(kPageWidth
, width
);
145 node
->SetDouble(kPageHeight
, height
);
146 node
->Set(kPageTextBox
, text
); // Takes ownership of |text|
// Builds a heap-allocated base::DictionaryValue describing one text box of
// the page, bounded by |left|, |top|, |right| and |bottom|:
//   kTextBoxLeft     - |left|
//   kTextBoxTop      - |page_height| - |top|
//   kTextBoxWidth    - |right| - |left|
//   kTextBoxHeight   - |top| - |bottom|
//   kTextBoxFontSize - FPDFText_GetFontSize() of the character found at
//                      (|left|, |top|) within kTolerance
//   kTextBoxNodes    - a base::ListValue of text / URL nodes
// The text is read with FPDFText_GetBoundedText() and converted to UTF-8.
// The node list depends on the detected area:
//   * DOCLINK_AREA: one URL node whose URL is kDocLinkURLPrefix followed by
//     the target page index.
//   * WEBLINK_AREA with a PDF link object: one URL node using targets[0].url.
//   * WEBLINK_AREA without a link object: the text is split around every
//     target URL found in it (ASCII case-insensitive search, with a fallback
//     that drops a leading "mailto:"), producing alternating text and URL
//     nodes.
//   * The final CreateTextNode(text_utf8) append is presumably the plain-text
//     fallback branch.
// NOTE(review): several lines (the last parameter, the declarations of
// |char_count|, |area|, |rect| and |start|, some branch heads and the return
// statement) are not visible in this excerpt.
151 base::Value
* PDFiumPage::GetTextBoxAsValue(double page_height
,
152 double left
, double top
,
153 double right
, double bottom
,
155 base::string16 text_utf16
;
// First call with a NULL buffer; the result is used as |char_count| below.
157 FPDFText_GetBoundedText(GetTextPage(), left
, top
, right
, bottom
, NULL
, 0);
158 if (char_count
> 0) {
159 unsigned short* data
= reinterpret_cast<unsigned short*>(
160 WriteInto(&text_utf16
, char_count
+ 1));
161 FPDFText_GetBoundedText(GetTextPage(),
162 left
, top
, right
, bottom
,
165 std::string text_utf8
= base::UTF16ToUTF8(text_utf16
);
// Look for a PDF link object at the top-left corner of the box.
167 FPDF_LINK link
= FPDFLink_GetLinkAtPoint(GetPage(), left
, top
);
169 std::vector
<LinkTarget
> targets
;
171 targets
.push_back(LinkTarget());
172 area
= GetLinkTarget(link
, &targets
[0]);
175 PageToScreen(pp::Point(), 1.0, left
, top
, right
, bottom
, rotation
));
176 GetLinks(rect
, &targets
);
177 area
= targets
.size() == 0 ? TEXT_AREA
: WEBLINK_AREA
;
180 int char_index
= FPDFText_GetCharIndexAtPos(GetTextPage(), left
, top
,
181 kTolerance
, kTolerance
);
182 double font_size
= FPDFText_GetFontSize(GetTextPage(), char_index
);
184 base::DictionaryValue
* node
= new base::DictionaryValue();
185 node
->SetDouble(kTextBoxLeft
, left
);
186 node
->SetDouble(kTextBoxTop
, page_height
- top
);
187 node
->SetDouble(kTextBoxWidth
, right
- left
);
188 node
->SetDouble(kTextBoxHeight
, top
- bottom
);
189 node
->SetDouble(kTextBoxFontSize
, font_size
);
191 base::ListValue
* text_nodes
= new base::ListValue();
193 if (area
== DOCLINK_AREA
) {
194 std::string url
= kDocLinkURLPrefix
+ base::IntToString(targets
[0].page
);
195 text_nodes
->Append(CreateURLNode(text_utf8
, url
));
196 } else if (area
== WEBLINK_AREA
&& link
) {
197 text_nodes
->Append(CreateURLNode(text_utf8
, targets
[0].url
));
198 } else if (area
== WEBLINK_AREA
&& !link
) {
200 for (size_t i
= 0; i
< targets
.size(); ++i
) {
201 // If there is an extra NULL character at end, find() will not return any
202 // matches. There should not be any though.
203 if (!targets
[i
].url
.empty())
204 DCHECK(targets
[i
].url
[targets
[i
].url
.size() - 1] != '\0');
206 // PDFium may change the case of generated links.
207 std::string lowerCaseURL
= base::StringToLowerASCII(targets
[i
].url
);
208 std::string lowerCaseText
= base::StringToLowerASCII(text_utf8
);
209 size_t pos
= lowerCaseText
.find(lowerCaseURL
, start
);
210 size_t length
= targets
[i
].url
.size();
211 if (pos
== std::string::npos
) {
212 // Check if the link is a "mailto:" URL
213 if (lowerCaseURL
.compare(0, 7, "mailto:") == 0) {
214 pos
= lowerCaseText
.find(lowerCaseURL
.substr(7), start
);
218 if (pos
== std::string::npos
) {
219 // No match has been found. This should never happen.
// Emit the text preceding the match (if any), then the matched link text.
224 std::string before_text
= text_utf8
.substr(start
, pos
- start
);
225 if (before_text
.size() > 0)
226 text_nodes
->Append(CreateTextNode(before_text
));
227 std::string link_text
= text_utf8
.substr(pos
, length
);
228 text_nodes
->Append(CreateURLNode(link_text
, targets
[i
].url
));
230 start
= pos
+ length
;
// Emit whatever text remains from |start| to the end of the string.
232 std::string before_text
= text_utf8
.substr(start
);
233 if (before_text
.size() > 0)
234 text_nodes
->Append(CreateTextNode(before_text
));
236 text_nodes
->Append(CreateTextNode(text_utf8
));
239 node
->Set(kTextBoxNodes
, text_nodes
); // Takes ownership of |text_nodes|.
// Creates a heap-allocated base::DictionaryValue for a plain text node:
// kTextNodeType is set to kTextNodeTypeText and kTextNodeText to |text|.
// NOTE(review): the return statement is not visible in this excerpt;
// presumably |node| is returned.
243 base::Value
* PDFiumPage::CreateTextNode(std::string text
) {
244 base::DictionaryValue
* node
= new base::DictionaryValue();
245 node
->SetString(kTextNodeType
, kTextNodeTypeText
);
246 node
->SetString(kTextNodeText
, text
);
// Creates a heap-allocated base::DictionaryValue for a link node:
// kTextNodeType is set to kTextNodeTypeURL, kTextNodeText to |text| and
// kTextNodeURL to |url|.
// NOTE(review): the return statement is not visible in this excerpt;
// presumably |node| is returned.
250 base::Value
* PDFiumPage::CreateURLNode(std::string text
, std::string url
) {
251 base::DictionaryValue
* node
= new base::DictionaryValue();
252 node
->SetString(kTextNodeType
, kTextNodeTypeURL
);
253 node
->SetString(kTextNodeText
, text
);
254 node
->SetString(kTextNodeURL
, url
);
// Hit-tests |point| against this page and classifies the area under it.
// Visible steps:
//   1. |point| is made relative to rect_ and converted to page coordinates
//      with FPDF_DeviceToPage().
//   2. The character index at that position is looked up with
//      FPDFText_GetCharIndexAtPos() using kTolerance in both directions.
//   3. If FPDPage_HasFormFieldAtPoint() reports a control greater than
//      FPDF_FORMFIELD_UNKNOWN, it is stored in |*form_type| and
//      NONSELECTABLE_AREA is returned.
//   4. A link object at the point is resolved through GetLinkTarget().
//   5. Otherwise GetLink() decides between WEBLINK_AREA (result != -1) and
//      TEXT_AREA.
// NOTE(review): part of the parameter list, several guard conditions and
// some assignments (e.g. to |*char_index|) are not visible in this excerpt.
258 PDFiumPage::Area
PDFiumPage::GetCharIndex(const pp::Point
& point
,
262 LinkTarget
* target
) {
264 return NONSELECTABLE_AREA
;
265 pp::Point point2
= point
- rect_
.point();
267 FPDF_DeviceToPage(GetPage(), 0, 0, rect_
.width(), rect_
.height(),
268 rotation
, point2
.x(), point2
.y(), &new_x
, &new_y
);
270 int rv
= FPDFText_GetCharIndexAtPos(
271 GetTextPage(), new_x
, new_y
, kTolerance
, kTolerance
);
275 FPDPage_HasFormFieldAtPoint(engine_
->form(), GetPage(), new_x
, new_y
);
276 if (control
> FPDF_FORMFIELD_UNKNOWN
) {
277 *form_type
= control
;
278 return PDFiumPage::NONSELECTABLE_AREA
;
281 FPDF_LINK link
= FPDFLink_GetLinkAtPoint(GetPage(), new_x
, new_y
);
283 // We don't handle all possible link types of the PDF. For example,
284 // launch actions, cross-document links, etc.
285 // In that case, GetLinkTarget() will return NONSELECTABLE_AREA
286 // and we should proceed with area detection.
287 PDFiumPage::Area area
= GetLinkTarget(link
, target
);
288 if (area
!= PDFiumPage::NONSELECTABLE_AREA
)
293 return NONSELECTABLE_AREA
;
295 return GetLink(*char_index
, target
) != -1 ? WEBLINK_AREA
: TEXT_AREA
;
// Returns the character at |index| of the text page, obtained from
// FPDFText_GetUnicode() and cast to base::char16.
// NOTE(review): any guard preceding the return is not visible in this
// excerpt.
298 base::char16
PDFiumPage::GetCharAtIndex(int index
) {
301 return static_cast<base::char16
>(FPDFText_GetUnicode(GetTextPage(), index
));
// Returns the number of characters on the text page as reported by
// FPDFText_CountChars().
// NOTE(review): any guard preceding the return is not visible in this
// excerpt.
304 int PDFiumPage::GetCharCount() {
307 return FPDFText_CountChars(GetTextPage());
// Resolves the PDF link object |link| into |target| and returns the kind of
// area it represents. Visible steps:
//   * The link's destination (FPDFLink_GetDest) is handed to
//     GetDestinationTarget().
//   * Otherwise the link's action (FPDFLink_GetAction) is inspected:
//       - PDFACTION_GOTO: the action's destination is handed to
//         GetDestinationTarget().
//       - PDFACTION_URI: the URI path is read with two
//         FPDFAction_GetURIPath() calls (size query, then fill) into
//         target->url through a PDFiumAPIStringBufferAdapter.
//   * NONSELECTABLE_AREA is returned at the end of the function.
// NOTE(review): guard conditions, some case bodies/returns and the
// declaration of |buffer_size| are not visible in this excerpt.
310 PDFiumPage::Area
PDFiumPage::GetLinkTarget(
311 FPDF_LINK link
, PDFiumPage::LinkTarget
* target
) {
312 FPDF_DEST dest
= FPDFLink_GetDest(engine_
->doc(), link
);
314 return GetDestinationTarget(dest
, target
);
316 FPDF_ACTION action
= FPDFLink_GetAction(link
);
318 switch (FPDFAction_GetType(action
)) {
319 case PDFACTION_GOTO
: {
320 FPDF_DEST dest
= FPDFAction_GetDest(engine_
->doc(), action
);
322 return GetDestinationTarget(dest
, target
);
323 // TODO(gene): We don't fully support all types of the in-document
324 // links. Need to implement that. There is a bug to track that:
325 // http://code.google.com/p/chromium/issues/detail?id=55776
327 case PDFACTION_URI
: {
// Size query: a NULL buffer with length 0.
330 FPDFAction_GetURIPath(engine_
->doc(), action
, NULL
, 0);
331 if (buffer_size
> 0) {
332 PDFiumAPIStringBufferAdapter
<std::string
> api_string_adapter(
333 &target
->url
, buffer_size
, true);
334 void* data
= api_string_adapter
.GetData();
335 size_t bytes_written
= FPDFAction_GetURIPath(
336 engine_
->doc(), action
, data
, buffer_size
);
337 api_string_adapter
.Close(bytes_written
);
342 // TODO(gene): We don't support PDFACTION_REMOTEGOTO and PDFACTION_LAUNCH
347 return NONSELECTABLE_AREA
;
// Looks up the page index of |destination| in engine_->doc() with
// FPDFDest_GetPageIndex() and stores it in target->page.
// NOTE(review): guard conditions and the returned Area values are not
// visible in this excerpt.
350 PDFiumPage::Area
PDFiumPage::GetDestinationTarget(
351 FPDF_DEST destination
, PDFiumPage::LinkTarget
* target
) {
352 int page_index
= FPDFDest_GetPageIndex(engine_
->doc(), destination
);
354 target
->page
= page_index
;
// Searches links_ for an entry with a rect containing the character at
// |char_index|. The character's box is read with FPDFText_GetCharBox() and
// converted with PageToScreen() (no offset, zoom 1.0, rotation 0); the
// resulting rect's point() is the value tested with Contains(). On a match
// the link's url is copied into target->url.
// NOTE(review): the declaration of |origin|, guard conditions and the return
// statements are not visible in this excerpt. The caller in GetCharIndex()
// treats a result of -1 as "no link".
359 int PDFiumPage::GetLink(int char_index
, PDFiumPage::LinkTarget
* target
) {
365 // Get the bounding box of the rect again, since it might have moved because
366 // of the tolerance above.
367 double left
, right
, bottom
, top
;
368 FPDFText_GetCharBox(GetTextPage(), char_index
, &left
, &right
, &bottom
, &top
);
371 PageToScreen(pp::Point(), 1.0, left
, top
, right
, bottom
, 0).point());
372 for (size_t i
= 0; i
< links_
.size(); ++i
) {
373 for (size_t j
= 0; j
< links_
[i
].rects
.size(); ++j
) {
374 if (links_
[i
].rects
[j
].Contains(origin
)) {
376 target
->url
= links_
[i
].url
;
// Collects the entries of links_ that have a rect intersecting |text_area|.
// For each such rect a LinkTarget carrying the link's url is appended to
// |*targets|. One visible path returns an empty std::vector<int>.
// NOTE(review): the condition for the early return, the declaration of
// |target|, what is stored in |links| and the final return statement are not
// visible in this excerpt.
384 std::vector
<int> PDFiumPage::GetLinks(pp::Rect text_area
,
385 std::vector
<LinkTarget
>* targets
) {
387 return std::vector
<int>();
391 std::vector
<int> links
;
393 for (size_t i
= 0; i
< links_
.size(); ++i
) {
394 for (size_t j
= 0; j
< links_
[i
].rects
.size(); ++j
) {
395 if (links_
[i
].rects
[j
].Intersects(text_area
)) {
398 target
.url
= links_
[i
].url
;
399 targets
->push_back(target
);
// Populates links_ from the web links that PDFium detects in the page text.
// calculated_links_ is checked on entry and then set to true. For every web
// link reported by FPDFLink_LoadWebLinks() / FPDFLink_CountWebLinks():
//   * the URL is read as UTF-16 with two FPDFLink_GetURL() calls (size
//     query, then fill) and converted to UTF-8;
//   * the URL is checked for convertibility to a string pp::Var and scanned
//     for characters below ' ' or at/above 0x7F;
//   * each rect from FPDFLink_GetRect() is converted with PageToScreen()
//     (no offset, zoom 1.0, rotation 0) and stored in link.rects;
//   * the link is appended to links_.
// The web-link handle is released with FPDFLink_CloseWebLinks().
// NOTE(review): the declarations of |url| and |link| and the statements that
// skip rejected links are not visible in this excerpt.
408 void PDFiumPage::CalculateLinks() {
409 if (calculated_links_
)
412 calculated_links_
= true;
413 FPDF_PAGELINK links
= FPDFLink_LoadWebLinks(GetTextPage());
414 int count
= FPDFLink_CountWebLinks(links
);
415 for (int i
= 0; i
< count
; ++i
) {
// Size query: a NULL buffer with length 0.
417 int url_length
= FPDFLink_GetURL(links
, i
, NULL
, 0);
418 if (url_length
> 0) {
419 PDFiumAPIStringBufferAdapter
<base::string16
> api_string_adapter(
420 &url
, url_length
, true);
421 unsigned short* data
=
422 reinterpret_cast<unsigned short*>(api_string_adapter
.GetData());
423 int actual_length
= FPDFLink_GetURL(links
, i
, data
, url_length
);
424 api_string_adapter
.Close(actual_length
);
427 link
.url
= base::UTF16ToUTF8(url
);
429 // If the link cannot be converted to a pp::Var, then it is not possible to
430 // pass it to JS. In this case, ignore the link like other PDF viewers.
431 // See http://crbug.com/312882 for an example.
432 pp::Var
link_var(link
.url
);
433 if (!link_var
.is_string())
436 // Make sure all the characters in the URL are valid per RFC 1738.
437 // http://crbug.com/340326 has a sample bad PDF.
438 // GURL does not work correctly, e.g. it just strips \t \r \n.
439 bool is_invalid_url
= false;
440 for (size_t j
= 0; j
< link
.url
.length(); ++j
) {
441 // Control characters are not allowed.
442 // 0x7F is also a control character.
443 // 0x80 and above are not in US-ASCII.
444 if (link
.url
[j
] < ' ' || link
.url
[j
] >= '\x7F') {
445 is_invalid_url
= true;
452 int rect_count
= FPDFLink_CountRects(links
, i
);
453 for (int j
= 0; j
< rect_count
; ++j
) {
454 double left
, top
, right
, bottom
;
455 FPDFLink_GetRect(links
, i
, j
, &left
, &top
, &right
, &bottom
);
456 link
.rects
.push_back(
457 PageToScreen(pp::Point(), 1.0, left
, top
, right
, bottom
, 0));
459 links_
.push_back(link
);
461 FPDFLink_CloseWebLinks(links
);
// Converts a rectangle given by the page-space corners (|left|, |top|) and
// (|right|, |bottom|) into a pp::Rect. Both corners are transformed with the
// same device area: origin (rect_ position minus |offset|) scaled by |zoom|,
// size rect_ width/height scaled by |zoom| and rounded up with ceil(), plus
// |rotation|. Afterwards the transformed coordinates are swapped where
// needed so that new_left <= new_right and new_top <= new_bottom, and the
// result's width and height are (new_right - new_left + 1) and
// (new_bottom - new_top + 1).
// NOTE(review): most of the parameter list, the name of the conversion
// function being called (presumably FPDF_PageToDevice) and the return
// statement's head are not visible in this excerpt.
464 pp::Rect
PDFiumPage::PageToScreen(const pp::Point
& offset
,
474 int new_left
, new_top
, new_right
, new_bottom
;
// Arguments of the conversion for the (left, top) corner.
477 static_cast<int>((rect_
.x() - offset
.x()) * zoom
),
478 static_cast<int>((rect_
.y() - offset
.y()) * zoom
),
479 static_cast<int>(ceil(rect_
.width() * zoom
)),
480 static_cast<int>(ceil(rect_
.height() * zoom
)),
481 rotation
, left
, top
, &new_left
, &new_top
);
// Arguments of the conversion for the (right, bottom) corner.
484 static_cast<int>((rect_
.x() - offset
.x()) * zoom
),
485 static_cast<int>((rect_
.y() - offset
.y()) * zoom
),
486 static_cast<int>(ceil(rect_
.width() * zoom
)),
487 static_cast<int>(ceil(rect_
.height() * zoom
)),
488 rotation
, right
, bottom
, &new_right
, &new_bottom
);
490 // If the PDF is rotated, the horizontal/vertical coordinates could be
492 // http://www.netl.doe.gov/publications/proceedings/03/ubc/presentations/Goeckner-pres.pdf
493 if (new_right
< new_left
)
494 std::swap(new_right
, new_left
);
495 if (new_bottom
< new_top
)
496 std::swap(new_bottom
, new_top
);
499 new_left
, new_top
, new_right
- new_left
+ 1, new_bottom
- new_top
+ 1);
// Increments page_->loading_count_ on construction. The matching decrement
// is done by the destructor; ~PDFiumPage() DCHECKs that the count is back
// to zero.
// NOTE(review): the member initializer that stores |page| is not visible in
// this excerpt.
502 PDFiumPage::ScopedLoadCounter::ScopedLoadCounter(PDFiumPage
* page
)
504 page_
->loading_count_
++;
// Decrements page_->loading_count_, undoing the increment made by the
// constructor.
507 PDFiumPage::ScopedLoadCounter::~ScopedLoadCounter() {
508 page_
->loading_count_
--;
// Default constructor of Link. No statements of its body are visible in this
// excerpt.
511 PDFiumPage::Link::Link() {
// Destructor of Link. No statements of its body are visible in this excerpt.
514 PDFiumPage::Link::~Link() {
517 } // namespace chrome_pdf