1 // Copyright (c) 2010 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "pdf/pdfium/pdfium_page.h"
9 #include "base/logging.h"
10 #include "base/strings/string_number_conversions.h"
11 #include "base/strings/string_util.h"
12 #include "base/strings/utf_string_conversions.h"
13 #include "base/values.h"
14 #include "pdf/pdfium/pdfium_engine.h"
// Tolerance used when doing hit detection. It is passed as both the x and the
// y tolerance to FPDFText_GetCharIndexAtPos().
// Declared as a typed constant instead of a macro so that it follows normal
// C++ scoping rules and cannot rewrite unrelated identifiers during
// preprocessing.
const double kTolerance = 20.0;
// Key names and fixed values used when the accessible page content is
// returned as a dictionary Value (JSON).
namespace {

// Keys of the page-level dictionary.
const char kPageWidth[] = "width";
const char kPageHeight[] = "height";
const char kPageTextBox[] = "textBox";

// Keys of each text box dictionary.
const char kTextBoxLeft[] = "left";
const char kTextBoxTop[] = "top";
const char kTextBoxWidth[] = "width";
const char kTextBoxHeight[] = "height";
const char kTextBoxFontSize[] = "fontSize";
const char kTextBoxNodes[] = "textNodes";

// Keys of each text node dictionary.
const char kTextNodeType[] = "type";
const char kTextNodeText[] = "text";
const char kTextNodeURL[] = "url";

// Values stored under kTextNodeType.
const char kTextNodeTypeText[] = "text";
const char kTextNodeTypeURL[] = "url";

// Prefix prepended to the target page number to form the URL of a link that
// points inside the document.
const char kDocLinkURLPrefix[] = "#page";

}  // namespace
36 namespace chrome_pdf
{
// Constructs the wrapper for one page of the document owned by |engine|.
// The visible part of the initializer list marks the web links as not yet
// calculated and stores |available| in available_.
// NOTE(review): the remaining parameters and member initializers are not
// visible in this extract.
38 PDFiumPage::PDFiumPage(PDFiumEngine
* engine
,
47 calculated_links_(false),
48 available_(available
) {
// Destructor.
// NOTE(review): the body is not visible in this extract.
51 PDFiumPage::~PDFiumPage() {
// Releases the PDFium handles held by this object: closes text_page_ with
// FPDFText_ClosePage() and page_ with FPDF_ClosePage().
// NOTE(review): the checks guarding each close and any resetting of the
// members are not visible in this extract.
55 void PDFiumPage::Unload() {
57 FPDFText_ClosePage(text_page_
);
// When the engine has a form environment, it is told that the page is about
// to be closed before the page handle itself is released.
62 if (engine_
->form()) {
63 FORM_OnBeforeClosePage(page_
, engine_
->form());
65 FPDF_ClosePage(page_
);
// Loads the page at index_ from the engine's document into page_ with
// FPDF_LoadPage(). A ScopedUnsupportedFeature is alive for the duration of
// the call.
// NOTE(review): the conditions under which loading is skipped and the return
// statement are not visible in this extract; presumably page_ is returned.
70 FPDF_PAGE
PDFiumPage::GetPage() {
71 ScopedUnsupportedFeature
scoped_unsupported_feature(engine_
);
75 page_
= FPDF_LoadPage(engine_
->doc(), index_
);
// Only notify the form environment when the load produced a page and the
// engine actually has a form environment.
76 if (page_
&& engine_
->form()) {
77 FORM_OnAfterLoadPage(page_
, engine_
->form());
// Loads the page at index_ from the engine's document into page_ with
// FPDF_LoadPage() while a ScopedUnsupportedFeature is alive. Unlike the
// visible part of GetPage(), no form notification appears here.
// NOTE(review): guarding conditions and the return statement are not visible
// in this extract.
83 FPDF_PAGE
PDFiumPage::GetPrintPage() {
84 ScopedUnsupportedFeature
scoped_unsupported_feature(engine_
);
88 page_
= FPDF_LoadPage(engine_
->doc(), index_
);
// Closes page_ with FPDF_ClosePage().
// NOTE(review): any guard around the call and any reset of page_ are not
// visible in this extract.
92 void PDFiumPage::ClosePrintPage() {
94 FPDF_ClosePage(page_
);
// Loads the text page for the page returned by GetPage() into text_page_
// using FPDFText_LoadPage().
// NOTE(review): the conditions guarding the load and the return statement are
// not visible in this extract; presumably text_page_ is returned.
99 FPDF_TEXTPAGE
PDFiumPage::GetTextPage() {
103 text_page_
= FPDFText_LoadPage(GetPage());
// Builds a newly allocated dictionary describing the page for accessibility:
// the page width and height plus a list (under kPageTextBox) built from one
// GetTextBoxAsValue() result per text rectangle reported by PDFium.
// |rotation| is forwarded unchanged to GetTextBoxAsValue().
// NOTE(review): several lines (including the call that consumes each
// GetTextBoxAsValue() result and the return statement) are not visible in
// this extract; presumably each box is appended to |text| and |node| is
// returned to the caller.
107 base::Value
* PDFiumPage::GetAccessibleContentAsValue(int rotation
) {
108 base::DictionaryValue
* node
= new base::DictionaryValue();
113 double width
= FPDF_GetPageWidth(GetPage());
114 double height
= FPDF_GetPageHeight(GetPage());
116 base::ListValue
* text
= new base::ListValue();
// Count the text rectangles covering every character of the page, starting
// at character 0.
117 int box_count
= FPDFText_CountRects(GetTextPage(), 0, GetCharCount());
118 for (int i
= 0; i
< box_count
; i
++) {
119 double left
, top
, right
, bottom
;
120 FPDFText_GetRect(GetTextPage(), i
, &left
, &top
, &right
, &bottom
);
// The page height is passed along so that the text box can express its top
// edge as (page height - top).
122 GetTextBoxAsValue(height
, left
, top
, right
, bottom
, rotation
));
125 node
->SetDouble(kPageWidth
, width
);
126 node
->SetDouble(kPageHeight
, height
);
127 node
->Set(kPageTextBox
, text
); // Takes ownership of |text|
// Builds a newly allocated dictionary describing one text box of the page.
//
// The dictionary holds the box geometry (left, top expressed as
// |page_height| - |top|, width as |right| - |left|, height as
// |top| - |bottom|), the font size of the character found near (left, top),
// and a list of text nodes stored under kTextBoxNodes. The node list depends
// on the kind of area the box is classified as:
//   - DOCLINK_AREA: a single URL node whose URL is kDocLinkURLPrefix followed
//     by the target page number.
//   - WEBLINK_AREA with a link object at (left, top): a single URL node using
//     targets[0].url.
//   - WEBLINK_AREA without a link object: the text is split into plain text
//     nodes and URL nodes by locating each target URL inside the text.
//   - otherwise: a single plain text node.
//
// NOTE(review): many lines of this function (the rotation parameter, several
// declarations, branch conditions, closing braces and the return statement)
// are not visible in this extract, so the exact nesting of the statements
// below cannot be confirmed from here.
132 base::Value
* PDFiumPage::GetTextBoxAsValue(double page_height
,
133 double left
, double top
,
134 double right
, double bottom
,
136 base::string16 text_utf16
;
// First call passes a NULL buffer of length 0; presumably its result is the
// character count tested below (the assignment is not visible here).
138 FPDFText_GetBoundedText(GetTextPage(), left
, top
, right
, bottom
, NULL
, 0);
139 if (char_count
> 0) {
// Reserve room for char_count characters plus one extra element and let
// PDFium write the UTF-16 text straight into the string's buffer.
140 unsigned short* data
= reinterpret_cast<unsigned short*>(
141 WriteInto(&text_utf16
, char_count
+ 1));
142 FPDFText_GetBoundedText(GetTextPage(),
143 left
, top
, right
, bottom
,
146 std::string text_utf8
= base::UTF16ToUTF8(text_utf16
);
// Look for a link object at the (left, top) corner of the box.
148 FPDF_LINK link
= FPDFLink_GetLinkAtPoint(GetPage(), left
, top
);
150 std::vector
<LinkTarget
> targets
;
152 targets
.push_back(LinkTarget());
153 area
= GetLinkTarget(link
, &targets
[0]);
// Alternative classification: collect the web links whose rectangles
// intersect the box (converted with PageToScreen at zoom 1.0 and no offset).
156 PageToScreen(pp::Point(), 1.0, left
, top
, right
, bottom
, rotation
));
157 GetLinks(rect
, &targets
);
158 area
= targets
.size() == 0 ? TEXT_AREA
: WEBLINK_AREA
;
// NOTE(review): char_index is passed to FPDFText_GetFontSize() without a
// visible check of the lookup result - confirm a failed lookup is handled.
161 int char_index
= FPDFText_GetCharIndexAtPos(GetTextPage(), left
, top
,
162 kTolerance
, kTolerance
);
163 double font_size
= FPDFText_GetFontSize(GetTextPage(), char_index
);
165 base::DictionaryValue
* node
= new base::DictionaryValue();
166 node
->SetDouble(kTextBoxLeft
, left
);
167 node
->SetDouble(kTextBoxTop
, page_height
- top
);
168 node
->SetDouble(kTextBoxWidth
, right
- left
);
169 node
->SetDouble(kTextBoxHeight
, top
- bottom
);
170 node
->SetDouble(kTextBoxFontSize
, font_size
);
172 base::ListValue
* text_nodes
= new base::ListValue();
174 if (area
== DOCLINK_AREA
) {
175 std::string url
= kDocLinkURLPrefix
+ base::IntToString(targets
[0].page
);
176 text_nodes
->Append(CreateURLNode(text_utf8
, url
));
177 } else if (area
== WEBLINK_AREA
&& link
) {
178 text_nodes
->Append(CreateURLNode(text_utf8
, targets
[0].url
));
179 } else if (area
== WEBLINK_AREA
&& !link
) {
181 for (size_t i
= 0; i
< targets
.size(); ++i
) {
182 // Remove the extra NULL character at end.
183 // Otherwise, find() will not return any matches.
184 if (targets
[i
].url
.size() > 0 &&
185 targets
[i
].url
[targets
[i
].url
.size() - 1] == '\0') {
186 targets
[i
].url
.resize(targets
[i
].url
.size() - 1);
188 // There should only ever be one NULL character
// NOTE(review): if the URL is empty at this point (for example it consisted
// only of the trailing NUL removed above), size() - 1 wraps around - confirm
// whether an emptiness check is needed here.
189 DCHECK(targets
[i
].url
[targets
[i
].url
.size() - 1] != '\0');
191 // PDFium may change the case of generated links.
192 std::string lowerCaseURL
= base::StringToLowerASCII(targets
[i
].url
);
193 std::string lowerCaseText
= base::StringToLowerASCII(text_utf8
);
// Search case-insensitively, starting at |start|, for the URL in the text.
194 size_t pos
= lowerCaseText
.find(lowerCaseURL
, start
);
195 size_t length
= targets
[i
].url
.size();
196 if (pos
== std::string::npos
) {
197 // Check if the link is a "mailto:" URL
198 if (lowerCaseURL
.compare(0, 7, "mailto:") == 0) {
// Retry the search with the 7-character "mailto:" scheme stripped.
199 pos
= lowerCaseText
.find(lowerCaseURL
.substr(7), start
);
203 if (pos
== std::string::npos
) {
204 // No match has been found. This should never happen.
// Emit the text between the previous match and this one as a plain text
// node (when non-empty), then the matched span as a URL node.
209 std::string before_text
= text_utf8
.substr(start
, pos
- start
);
210 if (before_text
.size() > 0)
211 text_nodes
->Append(CreateTextNode(before_text
));
212 std::string link_text
= text_utf8
.substr(pos
, length
);
213 text_nodes
->Append(CreateURLNode(link_text
, targets
[i
].url
));
215 start
= pos
+ length
;
// Whatever text remains from |start| onwards becomes a plain text node.
217 std::string before_text
= text_utf8
.substr(start
);
218 if (before_text
.size() > 0)
219 text_nodes
->Append(CreateTextNode(before_text
));
221 text_nodes
->Append(CreateTextNode(text_utf8
));
224 node
->Set(kTextBoxNodes
, text_nodes
); // Takes ownership of |text_nodes|.
// Creates a newly allocated dictionary representing a plain text node: its
// kTextNodeType entry is kTextNodeTypeText and its kTextNodeText entry is
// |text|.
// NOTE(review): the return statement is not visible in this extract;
// presumably |node| is returned.
228 base::Value
* PDFiumPage::CreateTextNode(std::string text
) {
229 base::DictionaryValue
* node
= new base::DictionaryValue();
230 node
->SetString(kTextNodeType
, kTextNodeTypeText
);
231 node
->SetString(kTextNodeText
, text
);
// Creates a newly allocated dictionary representing a link node: its
// kTextNodeType entry is kTextNodeTypeURL, its kTextNodeText entry is |text|
// and its kTextNodeURL entry is |url|.
// NOTE(review): the return statement is not visible in this extract;
// presumably |node| is returned.
235 base::Value
* PDFiumPage::CreateURLNode(std::string text
, std::string url
) {
236 base::DictionaryValue
* node
= new base::DictionaryValue();
237 node
->SetString(kTextNodeType
, kTextNodeTypeURL
);
238 node
->SetString(kTextNodeText
, text
);
239 node
->SetString(kTextNodeURL
, url
);
// Classifies the area under |point| and fills |target| when a link is found.
//
// The point is made relative to this page's rectangle, converted from device
// to page coordinates, and used to look up both the nearest character (within
// kTolerance) and a link object. If GetLinkTarget() recognizes the link, its
// result takes precedence; otherwise the result is WEBLINK_AREA when
// GetLink() finds a web link for the character and TEXT_AREA when it does
// not. NONSELECTABLE_AREA is returned on the early-exit paths.
// NOTE(review): some parameters, the conditions of the early returns and the
// statement that stores the character index are not visible in this extract.
243 PDFiumPage::Area
PDFiumPage::GetCharIndex(const pp::Point
& point
,
246 LinkTarget
* target
) {
248 return NONSELECTABLE_AREA
;
// Translate |point| so that it is relative to the page's own rectangle.
249 pp::Point point2
= point
- rect_
.point();
251 FPDF_DeviceToPage(GetPage(), 0, 0, rect_
.width(), rect_
.height(),
252 rotation
, point2
.x(), point2
.y(), &new_x
, &new_y
);
254 int rv
= FPDFText_GetCharIndexAtPos(
255 GetTextPage(), new_x
, new_y
, kTolerance
, kTolerance
);
258 FPDF_LINK link
= FPDFLink_GetLinkAtPoint(GetPage(), new_x
, new_y
);
260 // We don't handle all possible link types of the PDF. For example,
261 // launch actions, cross-document links, etc.
262 // In that case, GetLinkTarget() will return NONSELECTABLE_AREA
263 // and we should proceed with area detection.
264 PDFiumPage::Area area
= GetLinkTarget(link
, target
);
265 if (area
!= PDFiumPage::NONSELECTABLE_AREA
)
270 return NONSELECTABLE_AREA
;
272 return GetLink(*char_index
, target
) != -1 ? WEBLINK_AREA
: TEXT_AREA
;
// Returns the character at |index| of this page's text, obtained from
// FPDFText_GetUnicode() and converted to base::char16.
// NOTE(review): lines between the signature and the return are not visible
// in this extract, so any early-exit path cannot be described from here.
275 base::char16
PDFiumPage::GetCharAtIndex(int index
) {
278 return static_cast<base::char16
>(FPDFText_GetUnicode(GetTextPage(), index
));
// Returns the number of characters on this page's text page as reported by
// FPDFText_CountChars().
// NOTE(review): lines between the signature and the return are not visible
// in this extract, so any early-exit path cannot be described from here.
281 int PDFiumPage::GetCharCount() {
284 return FPDFText_CountChars(GetTextPage());
// Resolves |link| into |target| and reports what kind of area it is.
//
// A destination attached directly to the link is resolved through
// GetDestinationTarget(). Otherwise the link's action is inspected:
//   - PDFACTION_GOTO: the action's destination is resolved through
//     GetDestinationTarget().
//   - PDFACTION_URI: the URI is copied into target->url.
// NONSELECTABLE_AREA is returned for links that are not handled.
// NOTE(review): the conditions guarding the early returns, the value returned
// by the URI case and the remaining switch labels are not visible in this
// extract.
287 PDFiumPage::Area
PDFiumPage::GetLinkTarget(
288 FPDF_LINK link
, PDFiumPage::LinkTarget
* target
) {
289 FPDF_DEST dest
= FPDFLink_GetDest(engine_
->doc(), link
);
291 return GetDestinationTarget(dest
, target
);
293 FPDF_ACTION action
= FPDFLink_GetAction(link
);
295 switch (FPDFAction_GetType(action
)) {
296 case PDFACTION_GOTO
: {
297 FPDF_DEST dest
= FPDFAction_GetDest(engine_
->doc(), action
);
299 return GetDestinationTarget(dest
, target
);
300 // TODO(gene): We don't fully support all types of the in-document
301 // links. Need to implement that. There is a bug to track that:
302 // http://code.google.com/p/chromium/issues/detail?id=55776
304 case PDFACTION_URI
: {
// First call passes a NULL buffer of length 0; presumably its result is the
// buffer_size tested below (the assignment is not visible here).
307 FPDFAction_GetURIPath(engine_
->doc(), action
, NULL
, 0);
308 if (buffer_size
> 1) {
// Let PDFium write the URI directly into the storage of target->url.
309 void* data
= WriteInto(&target
->url
, buffer_size
);
310 FPDFAction_GetURIPath(engine_
->doc(), action
, data
, buffer_size
);
315 // TODO(gene): We don't support PDFACTION_REMOTEGOTO and PDFACTION_LAUNCH
320 return NONSELECTABLE_AREA
;
// Looks up the page index that |destination| refers to in the engine's
// document and stores it in target->page.
// NOTE(review): validation of the index, any null check on |target| and the
// returned Area value are not visible in this extract.
323 PDFiumPage::Area
PDFiumPage::GetDestinationTarget(
324 FPDF_DEST destination
, PDFiumPage::LinkTarget
* target
) {
325 int page_index
= FPDFDest_GetPageIndex(engine_
->doc(), destination
);
327 target
->page
= page_index
;
// Searches links_ for a web link with a rectangle containing the origin of
// the character box of |char_index| (converted with PageToScreen at zoom 1.0,
// no offset and rotation 0). On a match the link's URL is copied into
// target->url.
// NOTE(review): the early-exit conditions and the returned values are not
// visible in this extract; GetCharIndex() treats -1 as "no link found".
332 int PDFiumPage::GetLink(int char_index
, PDFiumPage::LinkTarget
* target
) {
338 // Get the bounding box of the rect again, since it might have moved because
339 // of the tolerance above.
340 double left
, right
, bottom
, top
;
// Note the out-parameter order of this call: left, right, bottom, top.
341 FPDFText_GetCharBox(GetTextPage(), char_index
, &left
, &right
, &bottom
, &top
);
344 PageToScreen(pp::Point(), 1.0, left
, top
, right
, bottom
, 0).point());
345 for (size_t i
= 0; i
< links_
.size(); ++i
) {
346 for (size_t j
= 0; j
< links_
[i
].rects
.size(); ++j
) {
347 if (links_
[i
].rects
[j
].Contains(origin
)) {
349 target
->url
= links_
[i
].url
;
// Collects the web links in links_ that have a rectangle intersecting
// |text_area|. For each intersection a LinkTarget carrying the link's URL is
// appended to |targets|.
// NOTE(review): the condition of the early return of an empty vector, the
// declaration of |target|, and what is stored in / returned through |links|
// are not visible in this extract.
357 std::vector
<int> PDFiumPage::GetLinks(pp::Rect text_area
,
358 std::vector
<LinkTarget
>* targets
) {
360 return std::vector
<int>();
364 std::vector
<int> links
;
366 for (size_t i
= 0; i
< links_
.size(); ++i
) {
367 for (size_t j
= 0; j
< links_
[i
].rects
.size(); ++j
) {
368 if (links_
[i
].rects
[j
].Intersects(text_area
)) {
371 target
.url
= links_
[i
].url
;
372 targets
->push_back(target
);
// Populates links_ with the web links PDFium detects in the page text.
//
// The work is done once: calculated_links_ is checked on entry and set to
// true before the links are loaded. For every web link the URL is read as
// UTF-16 and converted to UTF-8, checked for convertibility to a pp::Var
// string and for characters outside the printable US-ASCII range, and its
// rectangles are converted with PageToScreen (zoom 1.0, no offset, rotation
// 0) before the link is appended to links_. The web link handle is closed at
// the end.
// NOTE(review): the statements executed when a check fails (presumably the
// link is skipped), the declarations of |url| and |link|, and the closing
// braces are not visible in this extract.
381 void PDFiumPage::CalculateLinks() {
382 if (calculated_links_
)
385 calculated_links_
= true;
386 FPDF_PAGELINK links
= FPDFLink_LoadWebLinks(GetTextPage());
387 int count
= FPDFLink_CountWebLinks(links
);
388 for (int i
= 0; i
< count
; ++i
) {
// Size query first (NULL buffer, length 0), then the actual read below.
390 int url_length
= FPDFLink_GetURL(links
, i
, NULL
, 0);
391 if (url_length
> 1) { // WriteInto needs at least 2 characters.
392 unsigned short* data
=
393 reinterpret_cast<unsigned short*>(WriteInto(&url
, url_length
));
394 FPDFLink_GetURL(links
, i
, data
, url_length
);
397 link
.url
= base::UTF16ToUTF8(url
);
399 // If the link cannot be converted to a pp::Var, then it is not possible to
400 // pass it to JS. In this case, ignore the link like other PDF viewers.
401 // See http://crbug.com/312882 for an example.
402 pp::Var
link_var(link
.url
);
403 if (!link_var
.is_string())
406 // Make sure all the characters in the URL are valid per RFC 1738.
407 // http://crbug.com/340326 has a sample bad PDF.
408 // GURL does not work correctly, e.g. it just strips \t \r \n.
409 bool is_invalid_url
= false;
410 for (size_t j
= 0; j
< link
.url
.length(); ++j
) {
411 // Control characters are not allowed.
412 // 0x7F is also a control character.
413 // 0x80 and above are not in US-ASCII.
414 if (link
.url
[j
] < ' ' || link
.url
[j
] >= '\x7F') {
415 is_invalid_url
= true;
422 int rect_count
= FPDFLink_CountRects(links
, i
);
423 for (int j
= 0; j
< rect_count
; ++j
) {
424 double left
, top
, right
, bottom
;
425 FPDFLink_GetRect(links
, i
, j
, &left
, &top
, &right
, &bottom
);
426 link
.rects
.push_back(
427 PageToScreen(pp::Point(), 1.0, left
, top
, right
, bottom
, 0));
429 links_
.push_back(link
);
431 FPDFLink_CloseWebLinks(links
);
// Converts the page-space box (left, top, right, bottom) into a pp::Rect.
//
// Two coordinate conversions are performed with identical viewport
// arguments - the page rectangle shifted by |offset| and scaled by |zoom|,
// with the scaled width and height rounded up - one for the (left, top)
// corner and one for the (right, bottom) corner. The converted corners are
// then ordered so that left <= right and top <= bottom, and the resulting
// width and height each include one extra unit (right - left + 1 and
// bottom - top + 1).
// NOTE(review): the remaining parameters, the name of the conversion function
// being called and the start of the return expression are not visible in this
// extract.
434 pp::Rect
PDFiumPage::PageToScreen(const pp::Point
& offset
,
444 int new_left
, new_top
, new_right
, new_bottom
;
// Arguments of the conversion of the (left, top) corner.
447 static_cast<int>((rect_
.x() - offset
.x()) * zoom
),
448 static_cast<int>((rect_
.y() - offset
.y()) * zoom
),
449 static_cast<int>(ceil(rect_
.width() * zoom
)),
450 static_cast<int>(ceil(rect_
.height() * zoom
)),
451 rotation
, left
, top
, &new_left
, &new_top
);
// Arguments of the conversion of the (right, bottom) corner.
454 static_cast<int>((rect_
.x() - offset
.x()) * zoom
),
455 static_cast<int>((rect_
.y() - offset
.y()) * zoom
),
456 static_cast<int>(ceil(rect_
.width() * zoom
)),
457 static_cast<int>(ceil(rect_
.height() * zoom
)),
458 rotation
, right
, bottom
, &new_right
, &new_bottom
);
460 // If the PDF is rotated, the horizontal/vertical coordinates could be
462 // http://www.netl.doe.gov/publications/proceedings/03/ubc/presentations/Goeckner-pres.pdf
463 if (new_right
< new_left
)
464 std::swap(new_right
, new_left
);
465 if (new_bottom
< new_top
)
466 std::swap(new_bottom
, new_top
);
469 new_left
, new_top
, new_right
- new_left
+ 1, new_bottom
- new_top
+ 1);
// Constructor of the Link helper type.
// NOTE(review): the body is not visible in this extract.
472 PDFiumPage::Link::Link() {
// Destructor of the Link helper type.
// NOTE(review): the body is not visible in this extract.
475 PDFiumPage::Link::~Link() {
478 } // namespace chrome_pdf