ozone: evdev: Sync caps lock LED state to evdev
[chromium-blink-merge.git] / third_party / readability / js / readability.js
blob4308093edbb2e69e5f1464f72091a85180c140ff
// Copyright 2014 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

// Local modifications to this file are described in the README.chromium
// file.
/**
 * Debug logger. Writes the message to the console with a "Readability: "
 * prefix; when no global `console` exists it is a no-op.
 *
 * @param {string} s message to log
 * @return void
 */
var dbg = (typeof console !== 'undefined') ? function(s) {
    console.log("Readability: " + s);
} : function() {};
/*
 * Readability. An Arc90 Lab Experiment.
 * Website: http://lab.arc90.com/experiments/readability
 * Source:  http://code.google.com/p/arc90labs-readability
 *
 * "Readability" is a trademark of Arc90 Inc and may not be used without explicit permission.
 *
 * Copyright (c) 2010 Arc90 Inc
 * Readability is licensed under the Apache License, Version 2.0.
 **/
22 var readability = {
23 readStyle: "style-newspaper",
24 readSize: "size-medium",
25 readMargin: "margin-wide",
27 distilledHTML: '',
28 distilledArticleContent: null,
29 nextPageLink: '',
31 version: '1.7.1',
32 iframeLoads: 0,
33 convertLinksToFootnotes: false,
34 reversePageScroll: false, /* If they hold shift and hit space, scroll up */
35 frameHack: false, /**
36 * The frame hack is to workaround a firefox bug where if you
37 * pull content out of a frame and stick it into the parent element, the scrollbar won't appear.
38 * So we fake a scrollbar in the wrapping div.
39 **/
40 biggestFrame: false,
41 flags: 0x1 | 0x2 | 0x4, /* Start with all flags set. */
43 /* constants */
44 FLAG_STRIP_UNLIKELYS: 0x1,
45 FLAG_WEIGHT_CLASSES: 0x2,
46 FLAG_CLEAN_CONDITIONALLY: 0x4,
48 maxPages: 30, /* The maximum number of pages to loop through before we call it quits and just show a link. */
49 parsedPages: {}, /* The list of pages we've parsed in this call of readability, for autopaging. As a key store for easier searching. */
50 pageETags: {}, /* A list of the ETag headers of pages we've parsed, in case they happen to match, we'll know it's a duplicate. */
52 /**
53 * All of the regular expressions in use within readability.
54 * Defined up here so we don't instantiate them repeatedly in loops.
55 **/
56 regexps: {
57 unlikelyCandidates: /combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter/i,
58 okMaybeItsACandidate: /and|article|body|column|main|shadow/i,
59 positive: /article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i,
60 negative: /combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget/i,
61 extraneous: /print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single/i,
62 divToPElements: /<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i,
63 replaceBrs: /(<br[^>]*>[ \n\r\t]*){2,}/gi,
64 replaceFonts: /<(\/?)font[^>]*>/gi,
65 trim: /^\s+|\s+$/g,
66 normalize: /\s{2,}/g,
67 killBreaks: /(<br\s*\/?>(\s|&nbsp;?)*){1,}/g,
68 videos: /http:\/\/(www\.)?(youtube|vimeo)\.com/i,
69 skipFootnoteLink: /^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i,
70 nextLink: /(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i, // Match: next, continue, >, >>, » but not >|, »| as those usually mean last.
71 prevLink: /(prev|earl|old|new|<|«)/i
74 /**
75 * Runs readability.
77 * Workflow:
78 * 1. Prep the document by removing script tags, css, etc.
79 * 2. Build readability's DOM tree.
80 * 3. Grab the article content from the current dom tree.
81 * 4. Replace the current DOM tree with the new one.
82 * 5. Read peacefully.
84 * @return void
85 **/
86 init: function() {
87 /* Before we do anything, remove all scripts that are not readability. */
88 window.onload = window.onunload = function() {};
90 readability.removeScripts(document);
92 /* Make sure this document is added to the list of parsed pages first, so we don't double up on the first page */
93 readability.parsedPages[window.location.href.replace(/\/$/, '')] = true;
95 /* Pull out any possible next page link first */
96 readability.nextPageLink = readability.findNextPageLink(document.body);
98 /* We handle processing of nextPage from C++ set nextPageLink to null */
99 var nextPageLink = null;
101 readability.prepDocument();
103 /* Build readability's DOM tree */
104 var overlay = document.createElement("DIV");
105 var innerDiv = document.createElement("DIV");
106 var articleTools = readability.getArticleTools();
107 var articleTitleText = readability.getArticleTitle();
108 var articleContent = readability.grabArticle();
110 if(!articleContent) {
111 articleContent = document.createElement("DIV");
112 articleContent.id = "readability-content";
113 articleContent.innerHTML = [
114 "<p>Sorry, readability was unable to parse this page for content. If you feel like it should have been able to, please <a href='http://code.google.com/p/arc90labs-readability/issues/entry'>let us know by submitting an issue.</a></p>",
115 (readability.frameHack ? "<p><strong>It appears this page uses frames.</strong> Unfortunately, browser security properties often cause Readability to fail on pages that include frames." : ""),
116 "<p>Also, please note that Readability does not play very nicely with front pages. Readability is intended to work on articles with a sizable chunk of text that you'd like to read comfortably. If you're using Readability on a landing page (like nytimes.com for example), please click into an article first before using Readability.</p>"
117 ].join('');
119 nextPageLink = null;
122 overlay.id = "readOverlay";
123 innerDiv.id = "readInner";
125 /* Apply user-selected styling */
126 document.body.className = readability.readStyle;
127 document.dir = readability.getSuggestedDirection(articleTitleText);
129 if (readability.readStyle === "style-athelas" || readability.readStyle === "style-apertura"){
130 overlay.className = readability.readStyle + " rdbTypekit";
131 } else {
132 overlay.className = readability.readStyle;
134 innerDiv.className = readability.readMargin + " " + readability.readSize;
136 if(typeof(readConvertLinksToFootnotes) !== 'undefined' && readConvertLinksToFootnotes === true) {
137 readability.convertLinksToFootnotes = true;
140 readability.distilledHTML = articleContent.innerHTML;
142 if(readability.frameHack) {
143 var readOverlay = document.getElementById('readOverlay');
144 readOverlay.style.height = '100%';
145 readOverlay.style.overflow = 'auto';
149 * If someone tries to use Readability on a site's root page, give them a warning about usage.
151 if((window.location.protocol + "//" + window.location.host + "/") === window.location.href) {
152 articleContent.style.display = "none";
153 var rootWarning = document.createElement('p');
154 rootWarning.id = "readability-warning";
155 rootWarning.innerHTML = "<em>Readability</em> was intended for use on individual articles and not home pages. " +
156 "If you'd like to try rendering this page anyway, <a onClick='javascript:document.getElementById(\"readability-warning\").style.display=\"none\";document.getElementById(\"readability-content\").style.display=\"block\";'>click here</a> to continue.";
158 innerDiv.insertBefore( rootWarning, articleContent );
161 readability.postProcessContent(articleContent);
163 window.scrollTo(0, 0);
165 if (nextPageLink) {
167 * Append any additional pages after a small timeout so that people
168 * can start reading without having to wait for this to finish processing.
170 window.setTimeout(function() {
171 readability.appendNextPage(nextPageLink);
172 }, 500);
175 /** Smooth scrolling **/
176 document.onkeydown = function(e) {
177 var code = (window.event) ? event.keyCode : e.keyCode;
178 if (code === 16) {
179 readability.reversePageScroll = true;
180 return;
183 if (code === 32) {
184 readability.curScrollStep = 0;
185 var windowHeight = window.innerHeight ? window.innerHeight : (document.documentElement.clientHeight ? document.documentElement.clientHeight : document.body.clientHeight);
187 if(readability.reversePageScroll) {
188 readability.scrollTo(readability.scrollTop(), readability.scrollTop() - (windowHeight - 50), 20, 10);
190 else {
191 readability.scrollTo(readability.scrollTop(), readability.scrollTop() + (windowHeight - 50), 20, 10);
194 return false;
198 document.onkeyup = function(e) {
199 var code = (window.event) ? event.keyCode : e.keyCode;
200 if (code === 16) {
201 readability.reversePageScroll = false;
202 return;
208 * Run any post-process modifications to article content as necessary.
210 * @param Element
211 * @return void
213 postProcessContent: function(articleContent) {
214 if(readability.convertLinksToFootnotes && !window.location.href.match(/wikipedia\.org/g)) {
215 readability.addFootnotes(articleContent);
218 readability.fixImageFloats(articleContent);
222 * Some content ends up looking ugly if the image is too large to be floated.
223 * If the image is wider than a threshold (currently 55%), no longer float it,
224 * center it instead.
226 * @param Element
227 * @return void
229 fixImageFloats: function (articleContent) {
230 var imageWidthThreshold = Math.min(articleContent.offsetWidth, 800) * 0.55,
231 images = articleContent.getElementsByTagName('img');
233 for(var i=0, il = images.length; i < il; i+=1) {
234 var image = images[i];
236 if(image.offsetWidth > imageWidthThreshold) {
237 image.className += " blockImage";
243 * Get the article tools Element that has buttons like reload, print.
245 * @return void
247 getArticleTools: function () {
248 var articleTools = document.createElement("DIV");
250 articleTools.id = "readTools";
251 articleTools.innerHTML =
252 "<a href='#' onclick='return window.location.reload()' title='Reload original page' id='reload-page'>Reload Original Page</a>" +
253 "<a href='#' onclick='javascript:window.print();' title='Print page' id='print-page'>Print Page</a>" +
254 "<a href='#' onclick='readability.emailBox(); return false;' title='Email page' id='email-page'>Email Page</a>";
256 return articleTools;
260 * retuns the suggested direction of the string
262 * @return "rtl" || "ltr"
264 getSuggestedDirection: function(text) {
265 function sanitizeText() {
266 return text.replace(/@\w+/, "");
269 function countMatches(match) {
270 var matches = text.match(new RegExp(match, "g"));
271 return matches !== null ? matches.length : 0;
274 function isRTL() {
275 var count_heb = countMatches("[\\u05B0-\\u05F4\\uFB1D-\\uFBF4]");
276 var count_arb = countMatches("[\\u060C-\\u06FE\\uFB50-\\uFEFC]");
278 // if 20% of chars are Hebrew or Arbic then direction is rtl
279 return (count_heb + count_arb) * 100 / text.length > 20;
282 text = sanitizeText(text);
283 return isRTL() ? "rtl" : "ltr";
287 * Get the article title as an H1.
289 * @return void
291 getArticleTitle: function () {
292 var curTitle = "",
293 origTitle = "";
295 try {
296 curTitle = origTitle = document.title;
297 if(typeof curTitle !== "string") { /* If they had an element with id "title" in their HTML */
298 curTitle = origTitle = readability.getInnerText(document.getElementsByTagName('title')[0]);
301 catch(e) {}
303 if(curTitle.match(/ [\|\-] /))
305 curTitle = origTitle.replace(/(.*)[\|\-] .*/gi,'$1');
307 if(curTitle.split(' ').length < 3) {
308 curTitle = origTitle.replace(/[^\|\-]*[\|\-](.*)/gi,'$1');
311 else if(curTitle.indexOf(': ') !== -1)
313 curTitle = origTitle.replace(/.*:(.*)/gi, '$1');
315 if(curTitle.split(' ').length < 3) {
316 curTitle = origTitle.replace(/[^:]*[:](.*)/gi,'$1');
319 else if(curTitle.length > 150 || curTitle.length < 15)
321 var hOnes = document.getElementsByTagName('h1');
322 if(hOnes.length === 1)
324 curTitle = readability.getInnerText(hOnes[0]);
328 curTitle = curTitle.replace( readability.regexps.trim, "" );
330 if(curTitle.split(' ').length <= 4) {
331 curTitle = origTitle;
333 return curTitle;
337 * Prepare the HTML document for readability to scrape it.
338 * This includes things like stripping javascript, CSS, and handling terrible markup.
340 * @return void
342 prepDocument: function () {
344 * In some cases a body element can't be found (if the HTML is totally hosed for example)
345 * so we create a new body node and append it to the document.
347 if(document.body === null)
349 var body = document.createElement("body");
350 try {
351 document.body = body;
353 catch(e) {
354 document.documentElement.appendChild(body);
355 dbg(e);
359 document.body.id = "readabilityBody";
361 var frames = document.getElementsByTagName('frame');
362 if(frames.length > 0)
364 var bestFrame = null;
365 var bestFrameSize = 0; /* The frame to try to run readability upon. Must be on same domain. */
366 var biggestFrameSize = 0; /* Used for the error message. Can be on any domain. */
367 for(var frameIndex = 0; frameIndex < frames.length; frameIndex+=1)
369 var frameSize = frames[frameIndex].offsetWidth + frames[frameIndex].offsetHeight;
370 var canAccessFrame = false;
371 try {
372 var frameBody = frames[frameIndex].contentWindow.document.body;
373 canAccessFrame = true;
375 catch(eFrames) {
376 dbg(eFrames);
379 if(frameSize > biggestFrameSize) {
380 biggestFrameSize = frameSize;
381 readability.biggestFrame = frames[frameIndex];
384 if(canAccessFrame && frameSize > bestFrameSize)
386 readability.frameHack = true;
388 bestFrame = frames[frameIndex];
389 bestFrameSize = frameSize;
393 if(bestFrame)
395 var newBody = document.createElement('body');
396 readability.moveNodeInnards(bestFrame.contentWindow.document.body, newBody);
397 newBody.style.overflow = 'scroll';
398 document.body = newBody;
400 var frameset = document.getElementsByTagName('frameset')[0];
401 if(frameset) {
402 frameset.parentNode.removeChild(frameset); }
406 /* Remove all stylesheets */
407 for (var k=0;k < document.styleSheets.length; k+=1) {
408 if (document.styleSheets[k].href !== null && document.styleSheets[k].href.lastIndexOf("readability") === -1) {
409 document.styleSheets[k].disabled = true;
413 /* Remove all style tags in head (not doing this on IE) - TODO: Why not? */
414 var styleTags = document.getElementsByTagName("style");
415 for (var st=0;st < styleTags.length; st+=1) {
416 styleTags[st].textContent = "";
419 /* Turn all double br's into p's */
420 /* Note, this is pretty costly as far as processing goes. Maybe optimize later. */
421 readability.replaceDoubleBrsWithPs(document.body);
422 readability.replaceFontsWithSpans(document.body);
427 * Prepare the article node for display. Clean out any inline styles,
428 * iframes, forms, strip extraneous <p> tags, etc.
430 * @param Element
431 * @return void
433 prepArticle: function (articleContent) {
434 readability.cleanStyles(articleContent);
435 readability.killBreaks(articleContent);
437 /* Clean out junk from the article content */
438 readability.cleanConditionally(articleContent, "form");
439 readability.clean(articleContent, "object");
440 readability.clean(articleContent, "h1");
443 * If there is only one h2, they are probably using it
444 * as a header and not a subheader, so remove it since we already have a header.
445 ***/
446 if(articleContent.getElementsByTagName('h2').length === 1) {
447 readability.clean(articleContent, "h2");
449 readability.clean(articleContent, "iframe");
451 readability.cleanHeaders(articleContent);
453 /* Do these last as the previous stuff may have removed junk that will affect these */
454 readability.cleanConditionally(articleContent, "table");
455 readability.cleanConditionally(articleContent, "ul");
456 readability.cleanConditionally(articleContent, "div");
458 /* Remove extra paragraphs */
459 var articleParagraphs = articleContent.getElementsByTagName('p');
460 for(var i = articleParagraphs.length-1; i >= 0; i-=1) {
461 var imgCount = articleParagraphs[i].getElementsByTagName('img').length;
462 var embedCount = articleParagraphs[i].getElementsByTagName('embed').length;
463 var objectCount = articleParagraphs[i].getElementsByTagName('object').length;
465 if(imgCount === 0 && embedCount === 0 && objectCount === 0 && readability.getInnerText(articleParagraphs[i], false) === '') {
466 articleParagraphs[i].parentNode.removeChild(articleParagraphs[i]);
470 try {
471 readability.replaceBrsWithPs(articleContent);
473 catch (e) {
474 dbg("Cleaning innerHTML of breaks failed. This is an IE strict-block-elements bug. Ignoring.: " + e);
479 * Initialize a node with the readability object. Also checks the
480 * className/id for special names to add to its score.
482 * @param Element
483 * @return void
485 initializeNode: function (node) {
486 node.readability = {"contentScore": 0};
488 switch(node.tagName) {
489 case 'DIV':
490 node.readability.contentScore += 5;
491 break;
493 case 'PRE':
494 case 'TD':
495 case 'BLOCKQUOTE':
496 node.readability.contentScore += 3;
497 break;
499 case 'ADDRESS':
500 case 'OL':
501 case 'UL':
502 case 'DL':
503 case 'DD':
504 case 'DT':
505 case 'LI':
506 case 'FORM':
507 node.readability.contentScore -= 3;
508 break;
510 case 'H1':
511 case 'H2':
512 case 'H3':
513 case 'H4':
514 case 'H5':
515 case 'H6':
516 case 'TH':
517 node.readability.contentScore -= 5;
518 break;
521 node.readability.contentScore += readability.getClassWeight(node);
524 /***
525 * grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is
526 * most likely to be the stuff a user wants to read. Then return it wrapped up in a div.
528 * @param page a document to run upon. Needs to be a full document, complete with body.
529 * @return Element
531 grabArticle: function (pageToClone) {
532 var stripUnlikelyCandidates = readability.flagIsActive(readability.FLAG_STRIP_UNLIKELYS),
533 isPaging = (page !== null) ? true: false;
535 var page = null;
536 // Never work on the actual page.
537 if (isPaging) {
538 page = document.body.cloneNode(true);
539 } else {
540 page = pageToClone.cloneNode(true);
543 var allElements = page.getElementsByTagName('*');
546 * First, node prepping. Trash nodes that look cruddy (like ones with the class name "comment", etc), and turn divs
547 * into P tags where they have been used inappropriately (as in, where they contain no other block level elements.)
549 * Note: Assignment from index for performance. See http://www.peachpit.com/articles/article.aspx?p=31567&seqNum=5
550 * TODO: Shouldn't this be a reverse traversal?
552 var node = null;
553 var nodesToScore = [];
554 for(var nodeIndex = 0; (node = allElements[nodeIndex]); nodeIndex+=1) {
555 /* Remove unlikely candidates */
556 if (stripUnlikelyCandidates) {
557 var unlikelyMatchString = node.className + node.id;
558 if (
560 unlikelyMatchString.search(readability.regexps.unlikelyCandidates) !== -1 &&
561 unlikelyMatchString.search(readability.regexps.okMaybeItsACandidate) === -1 &&
562 node.tagName !== "BODY"
566 dbg("Removing unlikely candidate - " + unlikelyMatchString);
567 node.parentNode.removeChild(node);
568 nodeIndex-=1;
569 continue;
573 if (node.tagName === "P" || node.tagName === "TD" || node.tagName === "PRE") {
574 nodesToScore[nodesToScore.length] = node;
577 /* Turn all divs that don't have children block level elements into p's */
578 if (node.tagName === "DIV") {
579 if (node.innerHTML.search(readability.regexps.divToPElements) === -1) {
580 var newNode = document.createElement('p');
581 try {
582 readability.moveNodeInnards(node, newNode);
583 node.parentNode.replaceChild(newNode, node);
584 nodeIndex-=1;
586 nodesToScore[nodesToScore.length] = node;
588 catch(e) {
589 dbg("Could not alter div to p, probably an IE restriction, reverting back to div.: " + e);
592 else
594 /* EXPERIMENTAL */
595 for(var i = 0, il = node.childNodes.length; i < il; i+=1) {
596 var childNode = node.childNodes[i];
597 if(childNode.nodeType === 3) { // Node.TEXT_NODE
598 var p = document.createElement('p');
599 var t = document.createTextNode(childNode.nodeValue);
600 p.appendChild(t);
601 p.style.display = 'inline';
602 p.className = 'readability-styled';
603 childNode.parentNode.replaceChild(p, childNode);
611 * Loop through all paragraphs, and assign a score to them based on how content-y they look.
612 * Then add their score to their parent node.
614 * A score is determined by things like number of commas, class names, etc. Maybe eventually link density.
616 var candidates = [];
617 for (var pt=0; pt < nodesToScore.length; pt+=1) {
618 var parentNode = nodesToScore[pt].parentNode;
619 var grandParentNode = parentNode ? parentNode.parentNode : null;
620 var innerText = readability.getInnerText(nodesToScore[pt]);
622 if(!parentNode || typeof(parentNode.tagName) === 'undefined') {
623 continue;
626 /* If this paragraph is less than 25 characters, don't even count it. */
627 if(innerText.length < 25) {
628 continue; }
630 /* Initialize readability data for the parent. */
631 if(typeof parentNode.readability === 'undefined') {
632 readability.initializeNode(parentNode);
633 candidates.push(parentNode);
636 /* Initialize readability data for the grandparent. */
637 if(grandParentNode && typeof(grandParentNode.readability) === 'undefined' && typeof(grandParentNode.tagName) !== 'undefined') {
638 readability.initializeNode(grandParentNode);
639 candidates.push(grandParentNode);
642 var contentScore = 0;
644 /* Add a point for the paragraph itself as a base. */
645 contentScore+=1;
647 /* Add points for any commas within this paragraph */
648 contentScore += innerText.split(',').length;
650 /* For every 100 characters in this paragraph, add another point. Up to 3 points. */
651 contentScore += Math.min(Math.floor(innerText.length / 100), 3);
653 /* Add the score to the parent. The grandparent gets half. */
654 parentNode.readability.contentScore += contentScore;
656 if(grandParentNode) {
657 grandParentNode.readability.contentScore += contentScore/2;
662 * After we've calculated scores, loop through all of the possible candidate nodes we found
663 * and find the one with the highest score.
665 var topCandidate = null;
666 for(var c=0, cl=candidates.length; c < cl; c+=1)
669 * Scale the final candidates score based on link density. Good content should have a
670 * relatively small link density (5% or less) and be mostly unaffected by this operation.
672 candidates[c].readability.contentScore = candidates[c].readability.contentScore * (1-readability.getLinkDensity(candidates[c]));
674 dbg('Candidate: ' + candidates[c] + " (" + candidates[c].className + ":" + candidates[c].id + ") with score " + candidates[c].readability.contentScore);
676 if(!topCandidate || candidates[c].readability.contentScore > topCandidate.readability.contentScore) {
677 topCandidate = candidates[c]; }
681 * If we still have no top candidate, just use the body as a last resort.
682 * We also have to copy the body node so it is something we can modify.
684 if (topCandidate === null || topCandidate.tagName === "BODY")
686 topCandidate = document.createElement("DIV");
687 readability.replaceNodeInnards(page, topCandidate);
688 page.appendChild(topCandidate);
689 readability.initializeNode(topCandidate);
693 * Now that we have the top candidate, look through its siblings for content that might also be related.
694 * Things like preambles, content split by ads that we removed, etc.
696 var articleContent = document.createElement("DIV");
697 if (isPaging) {
698 articleContent.id = "readability-content";
700 var siblingScoreThreshold = Math.max(10, topCandidate.readability.contentScore * 0.2);
701 var siblingNodes = topCandidate.parentNode.childNodes;
704 for(var s=0, sl=siblingNodes.length; s < sl; s+=1) {
705 var siblingNode = siblingNodes[s];
706 var append = false;
709 * Fix for odd IE7 Crash where siblingNode does not exist even though this should be a live nodeList.
710 * Example of error visible here: http://www.esquire.com/features/honesty0707
712 if(!siblingNode) {
713 continue;
716 dbg("Looking at sibling node: " + siblingNode + " (" + siblingNode.className + ":" + siblingNode.id + ")" + ((typeof siblingNode.readability !== 'undefined') ? (" with score " + siblingNode.readability.contentScore) : ''));
717 dbg("Sibling has score " + (siblingNode.readability ? siblingNode.readability.contentScore : 'Unknown'));
719 if(siblingNode === topCandidate)
721 append = true;
724 var contentBonus = 0;
725 /* Give a bonus if sibling nodes and top candidates have the example same classname */
726 if(siblingNode.className === topCandidate.className && topCandidate.className !== "") {
727 contentBonus += topCandidate.readability.contentScore * 0.2;
730 if(typeof siblingNode.readability !== 'undefined' && (siblingNode.readability.contentScore+contentBonus) >= siblingScoreThreshold)
732 append = true;
735 if(siblingNode.nodeName === "P") {
736 var linkDensity = readability.getLinkDensity(siblingNode);
737 var nodeContent = readability.getInnerText(siblingNode);
738 var nodeLength = nodeContent.length;
740 if(nodeLength > 80 && linkDensity < 0.25)
742 append = true;
744 else if(nodeLength < 80 && linkDensity === 0 && nodeContent.search(/\.( |$)/) !== -1)
746 append = true;
750 if(append) {
751 dbg("Appending node: " + siblingNode);
753 var nodeToAppend = null;
754 if(siblingNode.nodeName !== "DIV" && siblingNode.nodeName !== "P") {
755 /* We have a node that isn't a common block level element, like a form or td tag. Turn it into a div so it doesn't get filtered out later by accident. */
757 dbg("Altering siblingNode of " + siblingNode.nodeName + ' to div.');
758 nodeToAppend = document.createElement("DIV");
759 try {
760 nodeToAppend.id = siblingNode.id;
761 readability.moveNodeInnards(siblingNode, nodeToAppend);
763 catch(er) {
764 dbg("Could not alter siblingNode to div, probably an IE restriction, reverting back to original.");
765 nodeToAppend = siblingNode;
766 s-=1;
767 sl-=1;
769 } else {
770 nodeToAppend = siblingNode;
771 s-=1;
772 sl-=1;
775 /* To ensure a node does not interfere with readability styles, remove its classnames */
776 nodeToAppend.className = "";
778 /* Append sibling and subtract from our list because it removes the node when you append to another node */
779 articleContent.appendChild(nodeToAppend);
784 * So we have all of the content that we need. Now we clean it up for presentation.
786 readability.distilledArticleContent = articleContent.cloneNode(true);
787 //readability.prepArticle(articleContent);
789 if (readability.curPageNum === 1) {
790 var newNode = document.createElement('div');
791 newNode.id = "readability-page-1";
792 newNode.setAttribute("class", "page");
793 readability.moveNodeInnards(articleContent, newNode);
794 articleContent.appendChild(newNode);
798 * Now that we've gone through the full algorithm, check to see if we got any meaningful content.
799 * If we didn't, we may need to re-run grabArticle with different flags set. This gives us a higher
800 * likelihood of finding the content, and the sieve approach gives us a higher likelihood of
801 * finding the -right- content.
803 if(readability.getInnerText(articleContent, false).length < 250) {
804 if (readability.flagIsActive(readability.FLAG_STRIP_UNLIKELYS)) {
805 readability.removeFlag(readability.FLAG_STRIP_UNLIKELYS);
806 return readability.grabArticle(document.body);
808 else if (readability.flagIsActive(readability.FLAG_WEIGHT_CLASSES)) {
809 readability.removeFlag(readability.FLAG_WEIGHT_CLASSES);
810 return readability.grabArticle(document.body);
812 else if (readability.flagIsActive(readability.FLAG_CLEAN_CONDITIONALLY)) {
813 readability.removeFlag(readability.FLAG_CLEAN_CONDITIONALLY);
814 return readability.grabArticle(document.body);
815 } else {
816 return null;
820 return articleContent;
824 * Removes script tags from the document.
826 * @param Element
828 removeScripts: function (doc) {
829 var scripts = doc.getElementsByTagName('script');
830 for(var i = scripts.length-1; i >= 0; i-=1)
832 if(typeof(scripts[i].src) === "undefined" || (scripts[i].src.indexOf('readability') === -1 && scripts[i].src.indexOf('typekit') === -1))
834 scripts[i].nodeValue="";
835 scripts[i].removeAttribute('src');
836 if (scripts[i].parentNode) {
837 scripts[i].parentNode.removeChild(scripts[i]);
844 * Get the inner text of a node - cross browser compatibly.
845 * This also strips out any excess whitespace to be found.
847 * @param Element
848 * @return string
850 getInnerText: function (e, normalizeSpaces) {
851 var textContent = "";
853 if(typeof(e.textContent) === "undefined" && typeof(e.innerText) === "undefined") {
854 return "";
857 normalizeSpaces = (typeof normalizeSpaces === 'undefined') ? true : normalizeSpaces;
859 if (navigator.appName === "Microsoft Internet Explorer") {
860 textContent = e.innerText.replace( readability.regexps.trim, "" ); }
861 else {
862 textContent = e.textContent.replace( readability.regexps.trim, "" ); }
864 if(normalizeSpaces) {
865 return textContent.replace( readability.regexps.normalize, " "); }
866 else {
867 return textContent; }
871 * Get the number of times a string s appears in the node e.
873 * @param Element
874 * @param string - what to split on. Default is ","
875 * @return number (integer)
877 getCharCount: function (e,s) {
878 s = s || ",";
879 return readability.getInnerText(e).split(s).length-1;
883 * Remove the style attribute on every e and under.
884 * TODO: Test if getElementsByTagName(*) is faster.
886 * @param Element
887 * @return void
889 cleanStyles: function (e) {
890 e = e || document;
891 var cur = e.firstChild;
893 if(!e) {
894 return; }
896 // Remove any root styles, if we're able.
897 if(typeof e.removeAttribute === 'function' && e.className !== 'readability-styled') {
898 e.removeAttribute('style'); }
900 // Go until there are no more child nodes
901 while ( cur !== null ) {
902 if ( cur.nodeType === 1 ) {
903 // Remove style attribute(s) :
904 if(cur.className !== "readability-styled") {
905 cur.removeAttribute("style");
907 readability.cleanStyles( cur );
909 cur = cur.nextSibling;
914 * Get the density of links as a percentage of the content
915 * This is the amount of text that is inside a link divided by the total text in the node.
917 * @param Element
918 * @return number (float)
920 getLinkDensity: function (e) {
921 var links = e.getElementsByTagName("a");
922 var textLength = readability.getInnerText(e).length;
923 var linkLength = 0;
924 for(var i=0, il=links.length; i<il;i+=1)
926 linkLength += readability.getInnerText(links[i]).length;
929 return linkLength / textLength;
933 * Find a cleaned up version of the current URL, to use for comparing links for possible next-pageyness.
935 * @author Dan Lacy
936 * @return string the base url
938 findBaseUrl: function () {
939 var noUrlParams = window.location.pathname.split("?")[0],
940 urlSlashes = noUrlParams.split("/").reverse(),
941 cleanedSegments = [],
942 possibleType = "";
944 for (var i = 0, slashLen = urlSlashes.length; i < slashLen; i+=1) {
945 var segment = urlSlashes[i];
947 // Split off and save anything that looks like a file type.
948 if (segment.indexOf(".") !== -1) {
949 possibleType = segment.split(".")[1];
951 /* If the type isn't alpha-only, it's probably not actually a file extension. */
952 if(!possibleType.match(/[^a-zA-Z]/)) {
953 segment = segment.split(".")[0];
958 * EW-CMS specific segment replacement. Ugly.
959 * Example: http://www.ew.com/ew/article/0,,20313460_20369436,00.html
961 if(segment.indexOf(',00') !== -1) {
962 segment = segment.replace(',00', '');
965 // If our first or second segment has anything looking like a page number, remove it.
966 if (segment.match(/((_|-)?p[a-z]*|(_|-))[0-9]{1,2}$/i) && ((i === 1) || (i === 0))) {
967 segment = segment.replace(/((_|-)?p[a-z]*|(_|-))[0-9]{1,2}$/i, "");
971 var del = false;
973 /* If this is purely a number, and it's the first or second segment, it's probably a page number. Remove it. */
974 if (i < 2 && segment.match(/^\d{1,2}$/)) {
975 del = true;
978 /* If this is the first segment and it's just "index", remove it. */
979 if(i === 0 && segment.toLowerCase() === "index") {
980 del = true;
984 /* If our first or second segment is smaller than 3 characters, and the first segment was purely alphas, remove it. */
985 if(i < 2 && segment.length < 3 && !urlSlashes[0].match(/[a-z]/i)) {
986 del = true;
989 /* If it's not marked for deletion, push it to cleanedSegments. */
990 if (!del) {
991 cleanedSegments.push(segment);
995 // This is our final, cleaned, base article URL.
996 return window.location.protocol + "//" + window.location.host + cleanedSegments.reverse().join("/");
1000 * Look for any paging links that may occur within the document.
1002 * @param body
1003 * @return object (array)
1005 findNextPageLink: function (elem) {
1006 var possiblePages = {},
1007 allLinks = elem.getElementsByTagName('a'),
1008 articleBaseUrl = readability.findBaseUrl();
1011 * Loop through all links, looking for hints that they may be next-page links.
1012 * Things like having "page" in their textContent, className or id, or being a child
1013 * of a node with a page-y className or id.
1015 * Also possible: levenshtein distance? longest common subsequence?
1017 * After we do that, assign each page a score, and
1019 for(var i = 0, il = allLinks.length; i < il; i+=1) {
1020 var link = allLinks[i],
1021 linkHref = allLinks[i].href.replace(/#.*$/, '').replace(/\/$/, '');
1023 /* If we've already seen this page, ignore it */
1024 if(linkHref === "" || linkHref === articleBaseUrl || linkHref === window.location.href || linkHref in readability.parsedPages) {
1025 continue;
1028 /* If it's on a different domain, skip it. */
1029 if(window.location.host !== linkHref.split(/\/+/g)[1]) {
1030 continue;
1033 var linkText = readability.getInnerText(link);
1035 /* If the linkText looks like it's not the next page, skip it. */
1036 if(linkText.match(readability.regexps.extraneous) || linkText.length > 25) {
1037 continue;
1040 /* If the leftovers of the URL after removing the base URL don't contain any digits, it's certainly not a next page link. */
1041 var linkHrefLeftover = linkHref.replace(articleBaseUrl, '');
1042 if(!linkHrefLeftover.match(/\d/)) {
1043 continue;
1046 if(!(linkHref in possiblePages)) {
1047 possiblePages[linkHref] = {"score": 0, "linkText": linkText, "href": linkHref};
1048 } else {
1049 possiblePages[linkHref].linkText += ' | ' + linkText;
1052 var linkObj = possiblePages[linkHref];
1055 * If the articleBaseUrl isn't part of this URL, penalize this link. It could still be the link, but the odds are lower.
1056 * Example: http://www.actionscript.org/resources/articles/745/1/JavaScript-and-VBScript-Injection-in-ActionScript-3/Page1.html
1058 if(linkHref.indexOf(articleBaseUrl) !== 0) {
1059 linkObj.score -= 25;
1062 var linkData = linkText + ' ' + link.className + ' ' + link.id;
1063 if(linkData.match(readability.regexps.nextLink)) {
1064 linkObj.score += 50;
1066 if(linkData.match(/pag(e|ing|inat)/i)) {
1067 linkObj.score += 25;
1069 if(linkData.match(/(first|last)/i)) { // -65 is enough to negate any bonuses gotten from a > or » in the text,
1070 /* If we already matched on "next", last is probably fine. If we didn't, then it's bad. Penalize. */
1071 if(!linkObj.linkText.match(readability.regexps.nextLink)) {
1072 linkObj.score -= 65;
1075 if(linkData.match(readability.regexps.negative) || linkData.match(readability.regexps.extraneous)) {
1076 linkObj.score -= 50;
1078 if(linkData.match(readability.regexps.prevLink)) {
1079 linkObj.score -= 200;
1082 /* If a parentNode contains page or paging or paginat */
1083 var parentNode = link.parentNode,
1084 positiveNodeMatch = false,
1085 negativeNodeMatch = false;
1086 while(parentNode) {
1087 var parentNodeClassAndId = parentNode.className + ' ' + parentNode.id;
1088 if(!positiveNodeMatch && parentNodeClassAndId && parentNodeClassAndId.match(/pag(e|ing|inat)/i)) {
1089 positiveNodeMatch = true;
1090 linkObj.score += 25;
1092 if(!negativeNodeMatch && parentNodeClassAndId && parentNodeClassAndId.match(readability.regexps.negative)) {
1093 /* If this is just something like "footer", give it a negative. If it's something like "body-and-footer", leave it be. */
1094 if(!parentNodeClassAndId.match(readability.regexps.positive)) {
1095 linkObj.score -= 25;
1096 negativeNodeMatch = true;
1100 parentNode = parentNode.parentNode;
1104 * If the URL looks like it has paging in it, add to the score.
1105 * Things like /page/2/, /pagenum/2, ?p=3, ?page=11, ?pagination=34
1107 if (linkHref.match(/p(a|g|ag)?(e|ing|ination)?(=|\/)[0-9]{1,2}/i) || linkHref.match(/(page|paging)/i)) {
1108 linkObj.score += 25;
1111 /* If the URL contains negative values, give a slight decrease. */
1112 if (linkHref.match(readability.regexps.extraneous)) {
1113 linkObj.score -= 15;
1117 * Minor punishment to anything that doesn't match our current URL.
1118 * NOTE: I'm finding this to cause more harm than good where something is exactly 50 points.
1119 * Dan, can you show me a counterexample where this is necessary?
1120 * if (linkHref.indexOf(window.location.href) !== 0) {
1121 * linkObj.score -= 1;
1126 * If the link text can be parsed as a number, give it a minor bonus, with a slight
1127 * bias towards lower numbered pages. This is so that pages that might not have 'next'
1128 * in their text can still get scored, and sorted properly by score.
1130 var linkTextAsNumber = parseInt(linkText, 10);
1131 if(linkTextAsNumber) {
1132 // Punish 1 since we're either already there, or it's probably before what we want anyways.
1133 if (linkTextAsNumber === 1) {
1134 linkObj.score -= 10;
1136 else {
1137 // Todo: Describe this better
1138 linkObj.score += Math.max(0, 10 - linkTextAsNumber);
1144 * Loop thrugh all of our possible pages from above and find our top candidate for the next page URL.
1145 * Require at least a score of 50, which is a relatively high confidence that this page is the next link.
1147 var topPage = null;
1148 for(var page in possiblePages) {
1149 if(possiblePages.hasOwnProperty(page)) {
1150 if(possiblePages[page].score >= 50 && (!topPage || topPage.score < possiblePages[page].score)) {
1151 topPage = possiblePages[page];
1156 if(topPage) {
1157 var nextHref = topPage.href.replace(/\/$/,'');
1159 dbg('NEXT PAGE IS ' + nextHref);
1160 readability.parsedPages[nextHref] = true;
1161 return nextHref;
1163 else {
1164 return null;
1168 createLinkDiv: function(link) {
1169 var divNode = document.createElement('div');
1170 var aNode = document.createElement('a');
1171 var tNode = document.createTextNode('View Next Page');
1172 divNode.setAttribute('style', 'text-align: center');
1173 aNode.setAttribute('href', link);
1174 aNode.appendChild(tNode);
1175 divNode.appendChild(aNode);
1176 return divNode;
1179 xhr: function () {
1180 if (typeof XMLHttpRequest !== 'undefined' && (window.location.protocol !== 'file:' || !window.ActiveXObject)) {
1181 return new XMLHttpRequest();
1183 else {
1184 try { return new ActiveXObject('Msxml2.XMLHTTP.6.0'); } catch(sixerr) { }
1185 try { return new ActiveXObject('Msxml2.XMLHTTP.3.0'); } catch(threrr) { }
1186 try { return new ActiveXObject('Msxml2.XMLHTTP'); } catch(err) { }
1189 return false;
1192 successfulRequest: function (request) {
1193 return (request.status >= 200 && request.status < 300) || request.status === 304 || (request.status === 0 && request.responseText);
1196 ajax: function (url, options) {
1197 var request = readability.xhr();
1199 function respondToReadyState(readyState) {
1200 if (request.readyState === 4) {
1201 if (readability.successfulRequest(request)) {
1202 if (options.success) { options.success(request); }
1204 else {
1205 if (options.error) { options.error(request); }
1210 if (typeof options === 'undefined') { options = {}; }
1212 request.onreadystatechange = respondToReadyState;
1214 request.open('get', url, true);
1215 request.setRequestHeader('Accept', 'text/html');
1217 try {
1218 request.send(options.postBody);
1220 catch (e) {
1221 if (options.error) { options.error(); }
1224 return request;
1228 * Make an AJAX request for each page and append it to the document.
1230 curPageNum: 1,
1232 appendNextPage: function (nextPageLink) {
1233 readability.curPageNum+=1;
1235 var articlePage = document.createElement("DIV");
1236 articlePage.id = 'readability-page-' + readability.curPageNum;
1237 articlePage.className = 'page';
1238 articlePage.innerHTML = '<p class="page-separator" title="Page ' + readability.curPageNum + '">&sect;</p>';
1240 document.getElementById("readability-content").appendChild(articlePage);
1242 if(readability.curPageNum > readability.maxPages) {
1243 var linkDiv = readability.createLinkDiv(nextPageLink);
1245 articlePage.appendChild(linkDiv);
1246 return;
1250 * Now that we've built the article page DOM element, get the page content
1251 * asynchronously and load the cleaned content into the div we created for it.
1253 (function(pageUrl, thisPage) {
1254 readability.ajax(pageUrl, {
1255 success: function(r) {
1257 /* First, check to see if we have a matching ETag in headers - if we do, this is a duplicate page. */
1258 var eTag = r.getResponseHeader('ETag');
1259 if(eTag) {
1260 if(eTag in readability.pageETags) {
1261 dbg("Exact duplicate page found via ETag. Aborting.");
1262 articlePage.style.display = 'none';
1263 return;
1264 } else {
1265 readability.pageETags[eTag] = 1;
1269 // TODO: this ends up doubling up page numbers on NYTimes articles. Need to generically parse those away.
1270 var page = document.createElement("DIV");
1273 * Do some preprocessing to our HTML to make it ready for appending.
1274 * • Remove any script tags. Swap and reswap newlines with a unicode character because multiline regex doesn't work in javascript.
1275 * • Turn any noscript tags into divs so that we can parse them. This allows us to find any next page links hidden via javascript.
1276 * • Turn all double br's into p's - was handled by prepDocument in the original view.
1277 * Maybe in the future abstract out prepDocument to work for both the original document and AJAX-added pages.
1279 var pageInnards = r.responseXML;
1280 readability.removeScripts(pageInnards);
1281 readability.replaceNoscriptsWithPs(pageInnards);
1282 readability.replaceDoubleBrsWithPs(pageInnards);
1283 readability.replaceFontsWithSpans(pageInnards);
1284 page.appendChild(pageInnards);
1288 * Reset all flags for the next page, as they will search through it and disable as necessary at the end of grabArticle.
1290 readability.flags = 0x1 | 0x2 | 0x4;
1292 var nextPageLink = readability.findNextPageLink(page),
1293 content = readability.grabArticle(page);
1295 if(!content) {
1296 dbg("No content found in page to append. Aborting.");
1297 return;
1301 * Anti-duplicate mechanism. Essentially, get the first paragraph of our new page.
1302 * Compare it against all of the the previous document's we've gotten. If the previous
1303 * document contains exactly the innerHTML of this first paragraph, it's probably a duplicate.
1305 var firstP = content.getElementsByTagName("P").length ? content.getElementsByTagName("P")[0] : null;
1306 if(firstP && firstP.innerHTML.length > 100) {
1307 for(var i=1; i <= readability.curPageNum; i+=1) {
1308 var rPage = document.getElementById('readability-page-' + i);
1309 if(rPage && rPage.innerHTML.indexOf(firstP.innerHTML) !== -1) {
1310 dbg('Duplicate of page ' + i + ' - skipping.');
1311 articlePage.style.display = 'none';
1312 readability.parsedPages[pageUrl] = true;
1313 return;
1318 readability.removeScripts(content);
1320 readability.moveNodeInnards(content, thisPage);
1323 * After the page has rendered, post process the content. This delay is necessary because,
1324 * in webkit at least, offsetWidth is not set in time to determine image width. We have to
1325 * wait a little bit for reflow to finish before we can fix floating images.
1327 window.setTimeout(
1328 function() { readability.postProcessContent(thisPage); },
1332 if(nextPageLink) {
1333 readability.appendNextPage(nextPageLink);
1337 }(nextPageLink, articlePage));
1341 * Get an elements class/id weight. Uses regular expressions to tell if this
1342 * element looks good or bad.
1344 * @param Element
1345 * @return number (Integer)
1347 getClassWeight: function (e) {
1348 if(!readability.flagIsActive(readability.FLAG_WEIGHT_CLASSES)) {
1349 return 0;
1352 var weight = 0;
1354 /* Look for a special classname */
1355 if (typeof(e.className) === 'string' && e.className !== '')
1357 if(e.className.search(readability.regexps.negative) !== -1) {
1358 weight -= 25; }
1360 if(e.className.search(readability.regexps.positive) !== -1) {
1361 weight += 25; }
1364 /* Look for a special ID */
1365 if (typeof(e.id) === 'string' && e.id !== '')
1367 if(e.id.search(readability.regexps.negative) !== -1) {
1368 weight -= 25; }
1370 if(e.id.search(readability.regexps.positive) !== -1) {
1371 weight += 25; }
1374 return weight;
1377 nodeIsVisible: function (node) {
1378 return (node.offsetWidth !== 0 || node.offsetHeight !== 0) && node.style.display.toLowerCase() !== 'none';
1382 * Remove extraneous break tags from a node.
1384 * @param Element
1385 * @return void
1387 killBreaks: function (e) {
1388 var allElements = e.getElementsByTagName('*');
1389 while (i < allElements.length) {
1390 readability.deleteExtraBreaks(allElements[i]);
1391 i++;
1396 * Clean a node of all elements of type "tag".
1397 * (Unless it's a youtube/vimeo video. People love movies.)
1399 * @param Element
1400 * @param string tag to clean
1401 * @return void
1403 clean: function (e, tag) {
1404 var targetList = e.getElementsByTagName( tag );
1405 var isEmbed = (tag === 'object' || tag === 'embed');
1407 for (var y=targetList.length-1; y >= 0; y-=1) {
1408 /* Allow youtube and vimeo videos through as people usually want to see those. */
1409 if(isEmbed) {
1410 var attributeValues = "";
1411 for (var i=0, il=targetList[y].attributes.length; i < il; i+=1) {
1412 attributeValues += targetList[y].attributes[i].value + '|';
1415 /* First, check the elements attributes to see if any of them contain youtube or vimeo */
1416 if (attributeValues.search(readability.regexps.videos) !== -1) {
1417 continue;
1420 /* Then check the elements inside this element for the same. */
1421 if (targetList[y].innerHTML.search(readability.regexps.videos) !== -1) {
1422 continue;
1427 targetList[y].parentNode.removeChild(targetList[y]);
1432 * Clean an element of all tags of type "tag" if they look fishy.
1433 * "Fishy" is an algorithm based on content length, classnames, link density, number of images & embeds, etc.
1435 * @return void
1437 cleanConditionally: function (e, tag) {
1439 if(!readability.flagIsActive(readability.FLAG_CLEAN_CONDITIONALLY)) {
1440 return;
1443 var tagsList = e.getElementsByTagName(tag);
1444 var curTagsLength = tagsList.length;
1447 * Gather counts for other typical elements embedded within.
1448 * Traverse backwards so we can remove nodes at the same time without effecting the traversal.
1450 * TODO: Consider taking into account original contentScore here.
1452 for (var i=curTagsLength-1; i >= 0; i-=1) {
1453 var weight = readability.getClassWeight(tagsList[i]);
1454 var contentScore = (typeof tagsList[i].readability !== 'undefined') ? tagsList[i].readability.contentScore : 0;
1456 dbg("Cleaning Conditionally " + tagsList[i] + " (" + tagsList[i].className + ":" + tagsList[i].id + ")" + ((typeof tagsList[i].readability !== 'undefined') ? (" with score " + tagsList[i].readability.contentScore) : ''));
1458 if(weight+contentScore < 0)
1460 tagsList[i].parentNode.removeChild(tagsList[i]);
1462 else if ( readability.getCharCount(tagsList[i],',') < 10) {
1464 * If there are not very many commas, and the number of
1465 * non-paragraph elements is more than paragraphs or other ominous signs, remove the element.
1467 var p = tagsList[i].getElementsByTagName("p").length;
1468 var img = tagsList[i].getElementsByTagName("img").length;
1469 var li = tagsList[i].getElementsByTagName("li").length-100;
1470 var input = tagsList[i].getElementsByTagName("input").length;
1472 var embedCount = 0;
1473 var embeds = tagsList[i].getElementsByTagName("embed");
1474 for(var ei=0,il=embeds.length; ei < il; ei+=1) {
1475 if (embeds[ei].src.search(readability.regexps.videos) === -1) {
1476 embedCount+=1;
1480 var linkDensity = readability.getLinkDensity(tagsList[i]);
1481 var contentLength = readability.getInnerText(tagsList[i]).length;
1482 var toRemove = false;
1484 if ( img > p ) {
1485 toRemove = true;
1486 } else if(li > p && tag !== "ul" && tag !== "ol") {
1487 toRemove = true;
1488 } else if( input > Math.floor(p/3) ) {
1489 toRemove = true;
1490 } else if(contentLength < 25 && (img === 0 || img > 2) ) {
1491 toRemove = true;
1492 } else if(weight < 25 && linkDensity > 0.2) {
1493 toRemove = true;
1494 } else if(weight >= 25 && linkDensity > 0.5) {
1495 toRemove = true;
1496 } else if((embedCount === 1 && contentLength < 75) || embedCount > 1) {
1497 toRemove = true;
1500 if(toRemove) {
1501 tagsList[i].parentNode.removeChild(tagsList[i]);
1508 * Clean out spurious headers from an Element. Checks things like classnames and link density.
1510 * @param Element
1511 * @return void
1513 cleanHeaders: function (e) {
1514 for (var headerIndex = 1; headerIndex < 3; headerIndex+=1) {
1515 var headers = e.getElementsByTagName('h' + headerIndex);
1516 for (var i=headers.length-1; i >=0; i-=1) {
1517 if (readability.getClassWeight(headers[i]) < 0 || readability.getLinkDensity(headers[i]) > 0.33) {
1518 headers[i].parentNode.removeChild(headers[i]);
1524 flagIsActive: function(flag) {
1525 return (readability.flags & flag) > 0;
1528 addFlag: function(flag) {
1529 readability.flags = readability.flags | flag;
1532 removeFlag: function(flag) {
1533 readability.flags = readability.flags & ~flag;
1536 // Removes the children of |src| and appends them to |dest|.
1537 moveNodeInnards: function(src, dest) {
1538 try {
1539 while (src.firstChild) {
1540 dest.appendChild(src.removeChild(src.firstChild));
1542 } catch (e) {}
1545 // Returns true if the node is a whitespace text node.
1546 isWhitespaceNode: function(node) {
1547 if (node.nodeType == Node.TEXT_NODE) {
1548 if (node.data.trim().length == 0) {
1549 return true;
1552 return false;
1555 // Returns true if the node is a <BR>.
1556 isBrNode: function(node) {
1557 return (node.tagName === 'BR');
1561 // Returns the last <BR> node in a sequence of <BR> nodes that are only
1562 // separated by whitespace, or null if there are not at least two <BR> tags
1563 // in the sibling chain starting with |node|. Returns the second such <BR>
1564 // node if |restrictToTwo| is true.
1565 isMultipleBr: function(node, restrictToTwo) {
1566 var lastBr = null;
1567 if (!readability.isBrNode(node)) {
1568 return lastBr;
1570 var curr = node.nextSibling;
1571 while (curr) {
1572 if (readability.isWhitespaceNode(curr) || readability.isBrNode(curr)) {
1573 lastBr = curr;
1574 curr = curr.nextSibling;
1575 if (restrictToTwo) {
1576 if (readability.isBrNode(lastBr)) {
1577 return lastBr;
1580 continue;
1582 break;
1584 return lastBr;
1587 // Removes all <BR> nodes except one and whitespace in between in a series
1588 // of <BR> nodes.
1589 deleteExtraBreaks: function(node) {
1590 var lastBr = readability.isMultipleBr(node, false);
1591 var ret = false;
1592 while (lastBr && lastBr != node) {
1593 var toRemove = lastBr;
1594 lastBr = lastBr.previousSibling;
1595 toRemove.parentNode.removeChild(toRemove);
1596 ret = true;
1598 return ret;
1601 // Replaces a pair of <BR> nodes (possibly separated by whitespace), with a
1602 // <P> node, and makes all next siblings of that pair children of <P>, up
1603 // until the next pair of <BR> nodes is reached.
1604 replaceDoubleBrWithP: function(node) {
1605 // Check that we are starting with a BR.
1606 var second = readability.isMultipleBr(node, true);
1607 if (!second) {
1608 return;
1610 // Make all next siblings of the second BR into children of a P.
1611 var p = document.createElement('p');
1612 var curr = second.nextSibling;
1613 while (curr) {
1614 if (readability.isMultipleBr(curr, true)) {
1615 break;
1617 var next = curr.nextSibling;
1618 p.appendChild(curr.parentNode.removeChild(curr));
1619 curr = next;
1621 var ret = curr;
1623 // Remove all nodes between the first and second BR.
1624 curr = node.nextSibling;
1625 while (curr && curr != second) {
1626 var next = curr.nextSibling;
1627 curr.parentNode.removeChild(curr);
1628 curr = next;
1630 // Remove the second BR.
1631 second.parentNode.removeChild(second);
1632 // Replace the first BR with the P.
1633 node.parentNode.replaceChild(p, node);
1635 return ret;
1638 // Returns true if the NodeList contains a double <BR>.
1639 hasDoubleBr: function(nodeList) {
1640 for (var i = 0; i < nodeList.length; nodeList++) {
1641 if (readability.isMultipleBr(nodeList[i], true)) {
1642 return true;
1645 return false;
1648 // Replaces double <BR> tags with <P> tags.
1649 replaceDoubleBrsWithPs: function(node) {
1650 var allElements = node.getElementsByTagName('BR');
1651 var node = null;
1652 while (allElements && allElements.length > 0 &&
1653 readability.hasDoubleBr(allElements)) {
1654 for (var nodeIndex = 0; (node = allElements[nodeIndex]); nodeIndex += 1) {
1655 var next = node;
1656 while (next = readability.replaceDoubleBrWithP(next));
1658 allElements = document.body.getElementsByTagName('BR');
1663 // Replaces a BR and the whitespace that follows it with a P.
1664 replaceBrWithP: function(node) {
1665 if (!readability.isBrNode(node)) {
1666 return;
1668 var p = document.createElement('p');
1669 var curr = node.nextSibling;
1670 while (curr && !isBrNode(curr)) {
1671 var next = curr.nextSibling;
1672 if (readability.isWhitespaceNode(curr)) {
1673 curr.parentNode.removeChild(curr);
1674 } else {
1675 p.appendChild(curr.parentNode.removeChild(curr));
1677 curr = next;
1679 node.parentNode.replaceChild(p, node);
1680 return curr;
1683 // Replaces all <BR> tags with <P> tags. Makes all next siblings of a <BR> tag
1684 // children of the <P>.
1685 replaceBrsWithPs: function(node) {
1686 var allElements = node.getElementsByTagName('BR');
1687 var node = null;
1688 while (allElements && allElements.length > 0) {
1689 for (var nodeIndex = 0; (node = allElements[nodeIndex]); nodeIndex += 1) {
1690 var next = node;
1691 while (next = readability.replaceBrWithP(next));
1693 allElements = document.body.getElementsByTagName('BR');
1697 // Replaces any tag with any other tag.
1698 replaceTagsWithTags: function(node, srcTag, destTag) {
1699 var allElements = node.getElementsByTagName(srcTag);
1700 for (var i = 0; i < allElements.length; i++) {
1701 var dest = document.createElement(destTag);
1702 readability.moveNodeInnards(allElements[i], dest);
1703 allElements[i].parentNode.replaceChild(dest, allElements[i]);
1707 // Replaces all <noscript> tags with <p> tags.
1708 replaceNoscriptsWithPs: function(node) {
1709 readability.replaceTagsWithTags(node, 'noscript', 'p');
1712 // Replaces all <font> tags with <span> tags.
1713 replaceFontsWithSpans: function(node) {
1714 readability.replaceTagsWithTags(node, 'font', 'span');
1717 // Returns a list of image URLs in the distilled article.
1718 getImages : function() {
1719 var images = document.getElementsByTagName('img');
1720 var result = new Array(images.length);
1721 dbg("Number of images: " + images.length);
1722 for(i = 0; i < images.length; i++) {
1723 result[i] = images[i].src;
1724 dbg("Image: " + result[i]);
1726 return result;
1729 // Returns the distilled article HTML from the page(s).
1730 getDistilledArticleHTML : function() {
1731 return readability.distilledHTML;
1734 // Returns the next page of this article.
1735 getNextPageLink : function() {
1736 return readability.nextPageLink;