1 // Copyright (c) 2010 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "pdf/pdfium/pdfium_page.h"
9 #include "base/logging.h"
10 #include "base/strings/string_number_conversions.h"
11 #include "base/strings/string_util.h"
12 #include "base/strings/utf_string_conversions.h"
13 #include "base/values.h"
14 #include "pdf/pdfium/pdfium_engine.h"
// Used when doing hit detection: the tolerance passed to PDFium, for both the
// x and the y direction, when looking up the character nearest to a point.
// A typed constant (rather than a macro) so it obeys scoping and has a type.
const double kTolerance = 20.0;
// Dictionary Value key names for returning the accessible page content as JSON.

// Keys of the page-level dictionary.
const char kPageWidth[] = "width";
const char kPageHeight[] = "height";
const char kPageTextBox[] = "textBox";

// Keys of each text-box dictionary.
const char kTextBoxLeft[] = "left";
const char kTextBoxTop[] = "top";
const char kTextBoxWidth[] = "width";
const char kTextBoxHeight[] = "height";
const char kTextBoxFontSize[] = "fontSize";
const char kTextBoxNodes[] = "textNodes";

// Keys of each text-node dictionary.
const char kTextNodeType[] = "type";
const char kTextNodeText[] = "text";
const char kTextNodeURL[] = "url";

// Values stored under kTextNodeType.
const char kTextNodeTypeText[] = "text";
const char kTextNodeTypeURL[] = "url";

// Prefix of the URL generated for links that point inside the document; the
// destination page number is appended to it.
const char kDocLinkURLPrefix[] = "#page";
36 namespace chrome_pdf
{
// Constructs a page that belongs to |engine|. Of the member initializers only
// the last two are visible here: link calculation starts out as "not done"
// and |available_| is taken from the |available| argument.
// NOTE(review): the remaining parameters and initializers are not visible in
// this excerpt.
38 PDFiumPage::PDFiumPage(PDFiumEngine
* engine
,
47 calculated_links_(false),
48 available_(available
) {
// Destructor.
// NOTE(review): the body is not visible in this excerpt; presumably it
// releases the PDFium handles the way Unload() does — confirm.
51 PDFiumPage::~PDFiumPage() {
// Releases the PDFium handles held by this page: closes the text page, lets
// the form-fill environment (when the engine has one) react before the page
// goes away, and then closes the page itself.
// NOTE(review): null checks and handle resets around these calls are not
// visible in this excerpt.
54 void PDFiumPage::Unload() {
56 FPDFText_ClosePage(text_page_
);
// Notify the form environment ahead of FPDF_ClosePage().
61 if (engine_
->form()) {
62 FORM_OnBeforeClosePage(page_
, engine_
->form());
64 FPDF_ClosePage(page_
);
// Provides the FPDF_PAGE handle of this page. The page is loaded from the
// engine's document by |index_|; when the load succeeds and the engine has a
// form-fill environment, that environment is notified through
// FORM_OnAfterLoadPage().
// NOTE(review): the conditions under which loading is skipped and the return
// statement are not visible in this excerpt.
69 FPDF_PAGE
PDFiumPage::GetPage() {
// Presumably scopes unsupported-feature reporting to |engine_| for the
// duration of the PDFium calls below — confirm against the class definition.
70 ScopedUnsupportedFeature
scoped_unsupported_feature(engine_
);
74 page_
= FPDF_LoadPage(engine_
->doc(), index_
);
75 if (page_
&& engine_
->form()) {
76 FORM_OnAfterLoadPage(page_
, engine_
->form());
// Print-path counterpart of GetPage(): loads the page from the engine's
// document into |page_|. No form-environment notification is visible on this
// path.
// NOTE(review): the guard conditions and the return statement are not visible
// in this excerpt.
82 FPDF_PAGE
PDFiumPage::GetPrintPage() {
83 ScopedUnsupportedFeature
scoped_unsupported_feature(engine_
);
87 page_
= FPDF_LoadPage(engine_
->doc(), index_
);
// Closes the page handle held in |page_| with FPDF_ClosePage(); the
// counterpart of GetPrintPage().
// NOTE(review): any null check or reset of |page_| is not visible in this
// excerpt.
91 void PDFiumPage::ClosePrintPage() {
93 FPDF_ClosePage(page_
);
// Provides the FPDF_TEXTPAGE handle used for text extraction. The text page is
// created from the page returned by GetPage() and kept in |text_page_|.
// NOTE(review): the guard conditions and the return statement are not visible
// in this excerpt.
98 FPDF_TEXTPAGE
PDFiumPage::GetTextPage() {
102 text_page_
= FPDFText_LoadPage(GetPage());
// Builds the accessibility description of the page as a dictionary holding the
// page width, the page height and a list of text boxes (keys kPageWidth,
// kPageHeight and kPageTextBox). One text box is produced per rectangle that
// PDFium reports for the page's whole character range; |rotation| is forwarded
// to GetTextBoxAsValue().
// NOTE(review): the return statement and any availability check are not
// visible in this excerpt. The dictionary is allocated with a raw new, so the
// caller presumably takes ownership — confirm.
106 base::Value
* PDFiumPage::GetAccessibleContentAsValue(int rotation
) {
107 base::DictionaryValue
* node
= new base::DictionaryValue();
112 double width
= FPDF_GetPageWidth(GetPage());
113 double height
= FPDF_GetPageHeight(GetPage());
115 base::ListValue
* text
= new base::ListValue();
// Count the rectangles covering every character of the page (start index 0,
// length GetCharCount()).
116 int box_count
= FPDFText_CountRects(GetTextPage(), 0, GetCharCount());
117 for (int i
= 0; i
< box_count
; i
++) {
118 double left
, top
, right
, bottom
;
119 FPDFText_GetRect(GetTextPage(), i
, &left
, &top
, &right
, &bottom
);
// The page height is passed along so the box can be described relative to
// the top of the page.
121 GetTextBoxAsValue(height
, left
, top
, right
, bottom
, rotation
));
124 node
->SetDouble(kPageWidth
, width
);
125 node
->SetDouble(kPageHeight
, height
);
126 node
->Set(kPageTextBox
, text
); // Takes ownership of |text|
// Describes one text rectangle (|left|, |top|, |right|, |bottom| in page
// coordinates) as a dictionary: its position and size, the font size sampled
// near its top-left corner, and a kTextBoxNodes list that splits the
// rectangle's text into plain-text nodes and URL nodes.
//
// Link handling, as far as it is visible here:
//  - a link annotation found at (left, top) is resolved with GetLinkTarget();
//  - the rectangle is also converted to screen space and matched against the
//    detected web links with GetLinks();
//  - document links become kDocLinkURLPrefix + page number, annotation web
//    links wrap the whole text, and detected web links are located inside the
//    text so that the text before, between and after them is emitted as
//    plain-text nodes.
// NOTE(review): the last parameter of the signature, several declarations
// (|char_count|, |area|, |rect|, |start|), the branch structure around the two
// link lookups and the return statement are not visible in this excerpt.
131 base::Value
* PDFiumPage::GetTextBoxAsValue(double page_height
,
132 double left
, double top
,
133 double right
, double bottom
,
135 base::string16 text_utf16
;
// First call passes no buffer; its result presumably feeds |char_count|. The
// second call below fills the string's storage.
137 FPDFText_GetBoundedText(GetTextPage(), left
, top
, right
, bottom
, NULL
, 0);
138 if (char_count
> 0) {
139 unsigned short* data
= reinterpret_cast<unsigned short*>(
140 WriteInto(&text_utf16
, char_count
+ 1));
141 FPDFText_GetBoundedText(GetTextPage(),
142 left
, top
, right
, bottom
,
145 std::string text_utf8
= base::UTF16ToUTF8(text_utf16
);
// Look for a link annotation under the box's top-left corner.
147 FPDF_LINK link
= FPDFLink_GetLinkAtPoint(GetPage(), left
, top
);
149 std::vector
<LinkTarget
> targets
;
151 targets
.push_back(LinkTarget());
152 area
= GetLinkTarget(link
, &targets
[0]);
// Match the box, in screen space (zero offset, zoom 1.0), against the web
// links detected in the page text.
155 PageToScreen(pp::Point(), 1.0, left
, top
, right
, bottom
, rotation
));
156 GetLinks(rect
, &targets
);
157 area
= targets
.size() == 0 ? TEXT_AREA
: WEBLINK_AREA
;
// NOTE(review): no check of |char_index| is visible between this lookup and
// FPDFText_GetFontSize(); confirm what happens when no character is found
// within the tolerance.
160 int char_index
= FPDFText_GetCharIndexAtPos(GetTextPage(), left
, top
,
161 kTolerance
, kTolerance
);
162 double font_size
= FPDFText_GetFontSize(GetTextPage(), char_index
);
// Geometry of the box: the top edge is stored as its distance from the top of
// the page, and the height as top - bottom.
164 base::DictionaryValue
* node
= new base::DictionaryValue();
165 node
->SetDouble(kTextBoxLeft
, left
);
166 node
->SetDouble(kTextBoxTop
, page_height
- top
);
167 node
->SetDouble(kTextBoxWidth
, right
- left
);
168 node
->SetDouble(kTextBoxHeight
, top
- bottom
);
169 node
->SetDouble(kTextBoxFontSize
, font_size
);
171 base::ListValue
* text_nodes
= new base::ListValue();
173 if (area
== DOCLINK_AREA
) {
// In-document link: the whole text becomes one URL node whose URL is the
// prefix followed by the destination page number.
174 std::string url
= kDocLinkURLPrefix
+ base::IntToString(targets
[0].page
);
175 text_nodes
->Append(CreateURLNode(text_utf8
, url
));
176 } else if (area
== WEBLINK_AREA
&& link
) {
// Web link coming from an annotation: the whole text is the link.
177 text_nodes
->Append(CreateURLNode(text_utf8
, targets
[0].url
));
178 } else if (area
== WEBLINK_AREA
&& !link
) {
// Web links detected in the text: find each URL inside the text and split
// the text around it.
180 for (size_t i
= 0; i
< targets
.size(); ++i
) {
181 // Remove the extra NULL character at end.
182 // Otherwise, find() will not return any matches.
183 if (targets
[i
].url
.size() > 0 &&
184 targets
[i
].url
[targets
[i
].url
.size() - 1] == '\0') {
185 targets
[i
].url
.resize(targets
[i
].url
.size() - 1);
// NOTE(review): if the URL is empty at this point (for instance it consisted
// of a single NUL that was just removed), size() - 1 wraps around and the
// DCHECK below indexes out of range — confirm that cannot happen.
187 // There should only ever be one NULL character
188 DCHECK(targets
[i
].url
[targets
[i
].url
.size() - 1] != '\0');
190 // PDFium may change the case of generated links.
191 std::string lowerCaseURL
= base::StringToLowerASCII(targets
[i
].url
);
192 std::string lowerCaseText
= base::StringToLowerASCII(text_utf8
);
// Search from |start| onwards, i.e. after the previously matched link.
193 size_t pos
= lowerCaseText
.find(lowerCaseURL
, start
);
194 size_t length
= targets
[i
].url
.size();
195 if (pos
== std::string::npos
) {
196 // Check if the link is a "mailto:" URL
197 if (lowerCaseURL
.compare(0, 7, "mailto:") == 0) {
// Retry without the 7-character "mailto:" scheme.
198 pos
= lowerCaseText
.find(lowerCaseURL
.substr(7), start
);
202 if (pos
== std::string::npos
) {
203 // No match has been found. This should never happen.
// Emit the plain text that precedes the link, then the link itself.
208 std::string before_text
= text_utf8
.substr(start
, pos
- start
);
209 if (before_text
.size() > 0)
210 text_nodes
->Append(CreateTextNode(before_text
));
211 std::string link_text
= text_utf8
.substr(pos
, length
);
212 text_nodes
->Append(CreateURLNode(link_text
, targets
[i
].url
));
214 start
= pos
+ length
;
// Whatever follows the last matched link is emitted as plain text.
216 std::string before_text
= text_utf8
.substr(start
);
217 if (before_text
.size() > 0)
218 text_nodes
->Append(CreateTextNode(before_text
));
// Presumably the fallback for boxes without any link: the whole text as a
// single plain-text node (the enclosing branch is not visible here).
220 text_nodes
->Append(CreateTextNode(text_utf8
));
223 node
->Set(kTextBoxNodes
, text_nodes
); // Takes ownership of |text_nodes|.
// Creates a text-node dictionary for a run of plain text: kTextNodeType is set
// to kTextNodeTypeText and kTextNodeText to |text|.
// NOTE(review): the return statement is not visible in this excerpt; the node
// is allocated with a raw new, so the caller presumably takes ownership.
227 base::Value
* PDFiumPage::CreateTextNode(std::string text
) {
228 base::DictionaryValue
* node
= new base::DictionaryValue();
229 node
->SetString(kTextNodeType
, kTextNodeTypeText
);
230 node
->SetString(kTextNodeText
, text
);
// Creates a text-node dictionary for a link: kTextNodeType is set to
// kTextNodeTypeURL, kTextNodeText to the visible |text| and kTextNodeURL to
// the link target |url|.
// NOTE(review): the return statement is not visible in this excerpt; the node
// is allocated with a raw new, so the caller presumably takes ownership.
234 base::Value
* PDFiumPage::CreateURLNode(std::string text
, std::string url
) {
235 base::DictionaryValue
* node
= new base::DictionaryValue();
236 node
->SetString(kTextNodeType
, kTextNodeTypeURL
);
237 node
->SetString(kTextNodeText
, text
);
238 node
->SetString(kTextNodeURL
, url
);
// Hit-tests |point| against this page. The point is made relative to the page
// rectangle, converted to page coordinates with FPDF_DeviceToPage() using
// |rotation|, and then checked for a character (within kTolerance) and for a
// link annotation.
// Result, as far as it is visible here: when GetLinkTarget() recognizes a
// link annotation its area is used (the statement under that check is not
// visible); otherwise the outcome is WEBLINK_AREA when GetLink() finds a
// detected web link at the character, TEXT_AREA when it does not, and
// NONSELECTABLE_AREA on the early-exit paths. |target| receives the link
// details.
// NOTE(review): part of the parameter list, the conditions guarding the early
// returns and the assignment to |*char_index| are not visible in this excerpt.
242 PDFiumPage::Area
PDFiumPage::GetCharIndex(const pp::Point
& point
,
245 LinkTarget
* target
) {
247 return NONSELECTABLE_AREA
;
// Translate into the page's own coordinate frame before asking PDFium.
248 pp::Point point2
= point
- rect_
.point();
250 FPDF_DeviceToPage(GetPage(), 0, 0, rect_
.width(), rect_
.height(),
251 rotation
, point2
.x(), point2
.y(), &new_x
, &new_y
);
253 int rv
= FPDFText_GetCharIndexAtPos(
254 GetTextPage(), new_x
, new_y
, kTolerance
, kTolerance
);
257 FPDF_LINK link
= FPDFLink_GetLinkAtPoint(GetPage(), new_x
, new_y
);
259 // We don't handle all possible link types of the PDF. For example,
260 // launch actions, cross-document links, etc.
261 // In that case, GetLinkTarget() will return NONSELECTABLE_AREA
262 // and we should proceed with area detection.
263 PDFiumPage::Area area
= GetLinkTarget(link
, target
);
264 if (area
!= PDFiumPage::NONSELECTABLE_AREA
)
269 return NONSELECTABLE_AREA
;
// GetLink() reports -1 when no detected web link covers the character.
271 return GetLink(*char_index
, target
) != -1 ? WEBLINK_AREA
: TEXT_AREA
;
// Returns the character at |index| of the page text: the value reported by
// FPDFText_GetUnicode(), narrowed to base::char16.
// NOTE(review): any guard ahead of the return is not visible in this excerpt.
274 base::char16
PDFiumPage::GetCharAtIndex(int index
) {
277 return static_cast<base::char16
>(FPDFText_GetUnicode(GetTextPage(), index
));
// Returns the number of characters on the page as reported by
// FPDFText_CountChars() for the page's text page.
// NOTE(review): any guard ahead of the return is not visible in this excerpt.
280 int PDFiumPage::GetCharCount() {
283 return FPDFText_CountChars(GetTextPage());
// Resolves the link annotation |link| into |target| and reports what kind of
// area it represents.
//  - A destination obtained from the link goes through GetDestinationTarget().
//  - Otherwise the link's action is inspected: GOTO actions are resolved
//    through GetDestinationTarget(), URI actions copy the URI into
//    |target->url|.
//  - Anything that is not handled ends in NONSELECTABLE_AREA.
// NOTE(review): the conditions guarding the early returns, the declaration of
// |buffer_size| and the value returned for URI actions are not visible in this
// excerpt.
286 PDFiumPage::Area
PDFiumPage::GetLinkTarget(
287 FPDF_LINK link
, PDFiumPage::LinkTarget
* target
) {
288 FPDF_DEST dest
= FPDFLink_GetDest(engine_
->doc(), link
);
290 return GetDestinationTarget(dest
, target
);
292 FPDF_ACTION action
= FPDFLink_GetAction(link
);
294 switch (FPDFAction_GetType(action
)) {
295 case PDFACTION_GOTO
: {
296 FPDF_DEST dest
= FPDFAction_GetDest(engine_
->doc(), action
);
298 return GetDestinationTarget(dest
, target
);
299 // TODO(gene): We don't fully support all types of the in-document
300 // links. Need to implement that. There is a bug to track that:
301 // http://code.google.com/p/chromium/issues/detail?id=55776
303 case PDFACTION_URI
: {
// Called without a buffer first; the result presumably feeds |buffer_size|,
// which sizes the string storage filled by the second call.
306 FPDFAction_GetURIPath(engine_
->doc(), action
, NULL
, 0);
307 if (buffer_size
> 1) {
308 void* data
= WriteInto(&target
->url
, buffer_size
+ 1);
309 FPDFAction_GetURIPath(engine_
->doc(), action
, data
, buffer_size
);
314 // TODO(gene): We don't support PDFACTION_REMOTEGOTO and PDFACTION_LAUNCH
319 return NONSELECTABLE_AREA
;
// Resolves |destination| to a page index within the engine's document and
// stores it in |target->page|.
// NOTE(review): the validity checks on |page_index| and the returned area
// values are not visible in this excerpt.
322 PDFiumPage::Area
PDFiumPage::GetDestinationTarget(
323 FPDF_DEST destination
, PDFiumPage::LinkTarget
* target
) {
324 int page_index
= FPDFDest_GetPageIndex(engine_
->doc(), destination
);
326 target
->page
= page_index
;
// Looks for a detected web link (an entry of |links_|) with a rectangle that
// contains the character at |char_index|. The character's box is converted to
// screen space (zero offset, zoom 1.0, no rotation) and the resulting origin
// point is tested against every rectangle of every link; on a hit the link's
// URL is copied into |target->url|.
// GetCharIndex() compares the result against -1 to detect "no link".
// NOTE(review): the declaration of |origin|, the guards at the top of the
// function and the return statements are not visible in this excerpt.
331 int PDFiumPage::GetLink(int char_index
, PDFiumPage::LinkTarget
* target
) {
337 // Get the bounding box of the rect again, since it might have moved because
338 // of the tolerance above.
339 double left
, right
, bottom
, top
;
340 FPDFText_GetCharBox(GetTextPage(), char_index
, &left
, &right
, &bottom
, &top
);
343 PageToScreen(pp::Point(), 1.0, left
, top
, right
, bottom
, 0).point());
344 for (size_t i
= 0; i
< links_
.size(); ++i
) {
345 for (size_t j
= 0; j
< links_
[i
].rects
.size(); ++j
) {
346 if (links_
[i
].rects
[j
].Contains(origin
)) {
348 target
->url
= links_
[i
].url
;
// Collects the detected web links (entries of |links_|) that have a rectangle
// intersecting |text_area|. For every intersecting rectangle a LinkTarget
// carrying the link's URL is appended to |targets|. The function returns a
// vector of ints; the visible early-exit path returns an empty one.
// NOTE(review): the early-exit condition, what is stored in |links| and the
// final return statement are not visible in this excerpt.
356 std::vector
<int> PDFiumPage::GetLinks(pp::Rect text_area
,
357 std::vector
<LinkTarget
>* targets
) {
359 return std::vector
<int>();
363 std::vector
<int> links
;
365 for (size_t i
= 0; i
< links_
.size(); ++i
) {
366 for (size_t j
= 0; j
< links_
[i
].rects
.size(); ++j
) {
367 if (links_
[i
].rects
[j
].Intersects(text_area
)) {
370 target
.url
= links_
[i
].url
;
371 targets
->push_back(target
);
// Fills |links_| with the web links PDFium detects in the page text.
// |calculated_links_| is checked on entry and set to true before the scan.
// For each detected link the URL is read as UTF-16 and converted to UTF-8.
// Links that cannot be represented as a pp::Var string, or that contain
// characters outside the printable US-ASCII range, are filtered out. The
// link's rectangles are stored in screen space (zero offset, zoom 1.0, no
// rotation).
// NOTE(review): the declarations of |url| and |link| and the statements that
// skip a rejected link are not visible in this excerpt.
380 void PDFiumPage::CalculateLinks() {
381 if (calculated_links_
)
384 calculated_links_
= true;
385 FPDF_PAGELINK links
= FPDFLink_LoadWebLinks(GetTextPage());
386 int count
= FPDFLink_CountWebLinks(links
);
387 for (int i
= 0; i
< count
; ++i
) {
// Called without a buffer first to learn the URL length, then again to fill
// the string's storage.
389 int url_length
= FPDFLink_GetURL(links
, i
, NULL
, 0);
390 if (url_length
> 1) { // WriteInto needs at least 2 characters.
391 unsigned short* data
=
392 reinterpret_cast<unsigned short*>(WriteInto(&url
, url_length
+ 1));
393 FPDFLink_GetURL(links
, i
, data
, url_length
);
396 link
.url
= base::UTF16ToUTF8(url
);
398 // If the link cannot be converted to a pp::Var, then it is not possible to
399 // pass it to JS. In this case, ignore the link like other PDF viewers.
400 // See http://crbug.com/312882 for an example.
401 pp::Var
link_var(link
.url
);
402 if (!link_var
.is_string())
405 // Make sure all the characters in the URL are valid per RFC 1738.
406 // http://crbug.com/340326 has a sample bad PDF.
407 // GURL does not work correctly, e.g. it just strips \t \r \n.
408 bool is_invalid_url
= false;
409 for (size_t j
= 0; j
< link
.url
.length(); ++j
) {
410 // Control characters are not allowed.
411 // 0x7F is also a control character.
412 // 0x80 and above are not in US-ASCII.
413 if (link
.url
[j
] < ' ' || link
.url
[j
] >= '\x7F') {
414 is_invalid_url
= true;
// Record every rectangle the link occupies, converted to screen space.
421 int rect_count
= FPDFLink_CountRects(links
, i
);
422 for (int j
= 0; j
< rect_count
; ++j
) {
423 double left
, top
, right
, bottom
;
424 FPDFLink_GetRect(links
, i
, j
, &left
, &top
, &right
, &bottom
);
425 link
.rects
.push_back(
426 PageToScreen(pp::Point(), 1.0, left
, top
, right
, bottom
, 0));
428 links_
.push_back(link
);
// Release the web-link handle obtained from FPDFLink_LoadWebLinks().
430 FPDFLink_CloseWebLinks(links
);
// Converts a rectangle given in page coordinates (|left|, |top|, |right|,
// |bottom|) into a screen-space pp::Rect. Both corners are converted using
// the page rectangle shifted by |offset| and scaled by |zoom|, together with
// |rotation|. If the conversion leaves the corners out of order they are
// swapped, and the result is expressed as an origin plus a width and height
// that include both edge pixels (hence the + 1).
// NOTE(review): most of the parameter list, the guards at the top of the
// function and the names of the two conversion calls are not visible in this
// excerpt (presumably FPDF_PageToDevice — confirm).
433 pp::Rect
PDFiumPage::PageToScreen(const pp::Point
& offset
,
443 int new_left
, new_top
, new_right
, new_bottom
;
// Top-left corner.
446 static_cast<int>((rect_
.x() - offset
.x()) * zoom
),
447 static_cast<int>((rect_
.y() - offset
.y()) * zoom
),
448 static_cast<int>(ceil(rect_
.width() * zoom
)),
449 static_cast<int>(ceil(rect_
.height() * zoom
)),
450 rotation
, left
, top
, &new_left
, &new_top
);
// Bottom-right corner, converted with the same device rectangle.
453 static_cast<int>((rect_
.x() - offset
.x()) * zoom
),
454 static_cast<int>((rect_
.y() - offset
.y()) * zoom
),
455 static_cast<int>(ceil(rect_
.width() * zoom
)),
456 static_cast<int>(ceil(rect_
.height() * zoom
)),
457 rotation
, right
, bottom
, &new_right
, &new_bottom
);
459 // If the PDF is rotated, the horizontal/vertical coordinates could be
461 // http://www.netl.doe.gov/publications/proceedings/03/ubc/presentations/Goeckner-pres.pdf
462 if (new_right
< new_left
)
463 std::swap(new_right
, new_left
);
464 if (new_bottom
< new_top
)
465 std::swap(new_bottom
, new_top
);
// Origin, then inclusive width and height.
468 new_left
, new_top
, new_right
- new_left
+ 1, new_bottom
- new_top
+ 1);
// Constructor of PDFiumPage::Link, presumably the record type stored in
// |links_| (a URL plus its rectangles) — confirm against the header.
// NOTE(review): the body is not visible in this excerpt.
471 PDFiumPage::Link::Link() {
// Destructor of PDFiumPage::Link.
// NOTE(review): the body is not visible in this excerpt.
474 PDFiumPage::Link::~Link() {
477 } // namespace chrome_pdf