// Copyright 2014 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

// Local modifications to this file are described in the README.chromium file.

var dbg = (typeof console !== 'undefined') ? function(s) {
  console.log("Readability: " + s);
} : function() {};

/*
 * Readability. An Arc90 Lab Experiment.
 * Website: http://lab.arc90.com/experiments/readability
 * Source:  http://code.google.com/p/arc90labs-readability
 *
 * "Readability" is a trademark of Arc90 Inc and may not be used without explicit permission.
 *
 * Copyright (c) 2010 Arc90 Inc
 * Readability is licensed under the Apache License, Version 2.0.
 */

var readability = {
  readStyle:                "style-newspaper",
  readSize:                 "size-medium",
  readMargin:               "margin-wide",
  distilledArticleContent:  null,
  convertLinksToFootnotes:  false,
  reversePageScroll:        false, /* If they hold shift and hit space, scroll up. */
  /**
   * The frame hack is to work around a Firefox bug where, if you pull content
   * out of a frame and stick it into the parent element, the scrollbar won't
   * appear. So we fake a scrollbar in the wrapping div.
   **/
  frameHack:                false,
  flags:                    0x1 | 0x2 | 0x4, /* Start with all flags set. */

  /* Constants */
  FLAG_STRIP_UNLIKELYS:     0x1,
  FLAG_WEIGHT_CLASSES:      0x2,
  FLAG_CLEAN_CONDITIONALLY: 0x4,
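
  /**
   * The three FLAG_* values above are bit positions within |flags|;
   * flagIsActive(), addFlag() and removeFlag() near the bottom of this file
   * test and toggle the individual bits. For example,
   * (0x1 | 0x2 | 0x4) & ~0x2 === 0x5, which clears FLAG_WEIGHT_CLASSES while
   * leaving the other two flags set.
   **/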

  curPageNum:               1,  /* Which page are we on? */
  maxPages:                 30, /* The maximum number of pages to loop through before we call it quits and just show a link. */
  parsedPages:              {}, /* The list of pages we've parsed in this call of readability, for autopaging. Stored as a key store for easier searching. */
  pageETags:                {}, /* A list of the ETag headers of pages we've parsed; if an ETag ever repeats, we know the page is a duplicate. */

  /**
   * All of the regular expressions in use within readability.
   * Defined up here so we don't instantiate them repeatedly in loops.
   **/
  regexps: {
    unlikelyCandidates:   /combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter/i,
    okMaybeItsACandidate: /and|article|body|column|main|shadow/i,
    positive:             /article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i,
    negative:             /combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget/i,
    extraneous:           /print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single/i,
    divToPElements:       /<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i,
    replaceBrs:           /(<br[^>]*>[ \n\r\t]*){2,}/gi,
    replaceFonts:         /<(\/?)font[^>]*>/gi,
    trim:                 /^\s+|\s+$/g,
    normalize:            /\s{2,}/g,
    killBreaks:           /(<br\s*\/?>(\s|&nbsp;?)*){1,}/g,
    videos:               /http:\/\/(www\.)?(youtube|vimeo)\.com/i,
    skipFootnoteLink:     /^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i,
    nextLink:             /(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i, // Match: next, continue, >, >>, » but not >|, »| as those usually mean last.
    prevLink:             /(prev|earl|old|new|<|«)/i
  },

  /**
   * Runs readability.
   *
   * Workflow:
   *  1. Prep the document by removing script tags, css, etc.
   *  2. Build readability's DOM tree.
   *  3. Grab the article content from the current dom tree.
   *  4. Replace the current DOM tree with the new one.
   **/
  init: function() {
    /* Before we do anything, remove all scripts that are not readability. */
    window.onload = window.onunload = function() {};

    readability.removeScripts(document);

    /* Make sure this document is added to the list of parsed pages first, so we don't double up on the first page. */
    readability.parsedPages[window.location.href.replace(/\/$/, '')] = true;

    /* Pull out any possible next page link first. */
    readability.nextPageLink = readability.findNextPageLink(document.body);
    /* We handle processing of the next page from C++, so set nextPageLink to null. */
    var nextPageLink = null;

    readability.prepDocument();

    /* Build readability's DOM tree. */
    var overlay          = document.createElement("DIV");
    var innerDiv         = document.createElement("DIV");
    var articleTools     = readability.getArticleTools();
    var articleTitleText = readability.getArticleTitle();
    var articleContent   = readability.grabArticle();

    if (!articleContent) {
      articleContent           = document.createElement("DIV");
      articleContent.id        = "readability-content";
      articleContent.innerHTML = [
        "<p>Sorry, readability was unable to parse this page for content. If you feel like it should have been able to, please <a href='http://code.google.com/p/arc90labs-readability/issues/entry'>let us know by submitting an issue.</a></p>",
        (readability.frameHack ? "<p><strong>It appears this page uses frames.</strong> Unfortunately, browser security properties often cause Readability to fail on pages that include frames.</p>" : ""),
        "<p>Also, please note that Readability does not play very nicely with front pages. Readability is intended to work on articles with a sizable chunk of text that you'd like to read comfortably. If you're using Readability on a landing page (like nytimes.com for example), please click into an article first before using Readability.</p>"
      ].join('');
    }

    overlay.id  = "readOverlay";
    innerDiv.id = "readInner";

    /* Apply user-selected styling. */
    document.body.className = readability.readStyle;
    document.dir            = readability.getSuggestedDirection(articleTitleText);

    if (readability.readStyle === "style-athelas" || readability.readStyle === "style-apertura") {
      overlay.className = readability.readStyle + " rdbTypekit";
    }
    else {
      overlay.className = readability.readStyle;
    }
    innerDiv.className = readability.readMargin + " " + readability.readSize;

    if (typeof(readConvertLinksToFootnotes) !== 'undefined' && readConvertLinksToFootnotes === true) {
      readability.convertLinksToFootnotes = true;
    }

    readability.distilledHTML = articleContent.innerHTML;

    if (readability.frameHack) {
      var readOverlay = document.getElementById('readOverlay');
      readOverlay.style.height   = '100%';
      readOverlay.style.overflow = 'auto';
    }

    /**
     * If someone tries to use Readability on a site's root page, give them a warning about usage.
     **/
    if ((window.location.protocol + "//" + window.location.host + "/") === window.location.href) {
      articleContent.style.display = "none";
      var rootWarning       = document.createElement('p');
      rootWarning.id        = "readability-warning";
      rootWarning.innerHTML = "<em>Readability</em> was intended for use on individual articles and not home pages. " +
        "If you'd like to try rendering this page anyway, <a onClick='javascript:document.getElementById(\"readability-warning\").style.display=\"none\";document.getElementById(\"readability-content\").style.display=\"block\";'>click here</a> to continue.";

      innerDiv.insertBefore(rootWarning, articleContent);
    }

    readability.postProcessContent(articleContent);

    window.scrollTo(0, 0);

    /**
     * Append any additional pages after a small timeout so that people
     * can start reading without having to wait for this to finish processing.
     **/
    window.setTimeout(function() {
      readability.appendNextPage(nextPageLink);
    }, 500);

    /** Smooth scrolling **/
    document.onkeydown = function(e) {
      var code = (window.event) ? event.keyCode : e.keyCode;
      if (code === 16) {  /* Shift key: scroll up instead of down. */
        readability.reversePageScroll = true;
        return;
      }

      if (code === 32) {  /* Space bar: scroll one window height. */
        readability.curScrollStep = 0;
        var windowHeight = window.innerHeight ? window.innerHeight : (document.documentElement.clientHeight ? document.documentElement.clientHeight : document.body.clientHeight);

        if (readability.reversePageScroll) {
          readability.scrollTo(readability.scrollTop(), readability.scrollTop() - (windowHeight - 50), 20, 10);
        }
        else {
          readability.scrollTo(readability.scrollTop(), readability.scrollTop() + (windowHeight - 50), 20, 10);
        }

        return false;
      }
    };

    document.onkeyup = function(e) {
      var code = (window.event) ? event.keyCode : e.keyCode;
      if (code === 16) {
        readability.reversePageScroll = false;
        return;
      }
    };
  },

  /**
   * Run any post-process modifications to article content as necessary.
   **/
  postProcessContent: function(articleContent) {
    if (readability.convertLinksToFootnotes && !window.location.href.match(/wikipedia\.org/g)) {
      readability.addFootnotes(articleContent);
    }

    readability.fixImageFloats(articleContent);
  },

  /**
   * Some content ends up looking ugly if the image is too large to be floated.
   * If the image is wider than a threshold (currently 55%), no longer float it.
   **/
  fixImageFloats: function (articleContent) {
    var imageWidthThreshold = Math.min(articleContent.offsetWidth, 800) * 0.55,
        images              = articleContent.getElementsByTagName('img');

    for (var i = 0, il = images.length; i < il; i += 1) {
      var image = images[i];

      if (image.offsetWidth > imageWidthThreshold) {
        image.className += " blockImage";
      }
    }
  },

  /**
   * Get the article tools Element that has buttons like reload, print.
   **/
  getArticleTools: function () {
    var articleTools = document.createElement("DIV");

    articleTools.id        = "readTools";
    articleTools.innerHTML =
      "<a href='#' onclick='return window.location.reload()' title='Reload original page' id='reload-page'>Reload Original Page</a>" +
      "<a href='#' onclick='javascript:window.print();' title='Print page' id='print-page'>Print Page</a>" +
      "<a href='#' onclick='readability.emailBox(); return false;' title='Email page' id='email-page'>Email Page</a>";

    return articleTools;
  },

  /**
   * Returns the suggested direction of the string.
   *
   * @return "rtl" || "ltr"
   **/
  getSuggestedDirection: function(text) {
    function sanitizeText() {
      return text.replace(/@\w+/, "");
    }

    function countMatches(match) {
      var matches = text.match(new RegExp(match, "g"));
      return matches !== null ? matches.length : 0;
    }

    function isRTL() {
      var count_heb = countMatches("[\\u05B0-\\u05F4\\uFB1D-\\uFBF4]");
      var count_arb = countMatches("[\\u060C-\\u06FE\\uFB50-\\uFEFC]");

      // If 20% of the characters are Hebrew or Arabic, the direction is rtl.
      return (count_heb + count_arb) * 100 / text.length > 20;
    }

    text = sanitizeText(text);
    return isRTL() ? "rtl" : "ltr";
  },
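
  /**
   * For example, for a title such as "שלום עולם | hello" well over 20% of the
   * characters fall in the Hebrew range, so getSuggestedDirection() returns
   * "rtl"; a plain ASCII title returns "ltr".
   **/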

  /**
   * Get the article title.
   **/
  getArticleTitle: function () {
    var curTitle  = "",
        origTitle = "";

    curTitle = origTitle = document.title;

    if (typeof curTitle !== "string") { /* If they had an element with id "title" in their HTML. */
      curTitle = origTitle = readability.getInnerText(document.getElementsByTagName('title')[0]);
    }

    if (curTitle.match(/ [\|\-] /)) {
      curTitle = origTitle.replace(/(.*)[\|\-] .*/gi, '$1');

      if (curTitle.split(' ').length < 3) {
        curTitle = origTitle.replace(/[^\|\-]*[\|\-](.*)/gi, '$1');
      }
    }
    else if (curTitle.indexOf(': ') !== -1) {
      curTitle = origTitle.replace(/.*:(.*)/gi, '$1');

      if (curTitle.split(' ').length < 3) {
        curTitle = origTitle.replace(/[^:]*[:](.*)/gi, '$1');
      }
    }
    else if (curTitle.length > 150 || curTitle.length < 15) {
      var hOnes = document.getElementsByTagName('h1');
      if (hOnes.length === 1) {
        curTitle = readability.getInnerText(hOnes[0]);
      }
    }

    curTitle = curTitle.replace(readability.regexps.trim, "");

    if (curTitle.split(' ').length <= 4) {
      curTitle = origTitle;
    }

    return curTitle;
  },
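
  /**
   * e.g. a document.title of "Deep Dive Into Browser Frames | Example News" is
   * cut at the " | " separator, giving "Deep Dive Into Browser Frames"; if the
   * cut-down title ends up four words or fewer, the original title is kept
   * instead.
   **/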

  /**
   * Prepare the HTML document for readability to scrape it.
   * This includes things like stripping javascript, CSS, and handling terrible markup.
   **/
  prepDocument: function () {
    /**
     * In some cases a body element can't be found (if the HTML is totally hosed for example),
     * so we create a new body node and append it to the document.
     **/
    if (document.body === null) {
      var body = document.createElement("body");
      try {
        document.body = body;
      }
      catch(e) {
        document.documentElement.appendChild(body);
        dbg(e);
      }
    }

    document.body.id = "readabilityBody";

    var frames = document.getElementsByTagName('frame');
    if (frames.length > 0) {
      var bestFrame = null;
      var bestFrameSize = 0;    /* The frame to try to run readability upon. Must be on same domain. */
      var biggestFrameSize = 0; /* Used for the error message. Can be on any domain. */
      for (var frameIndex = 0; frameIndex < frames.length; frameIndex += 1) {
        var frameSize = frames[frameIndex].offsetWidth + frames[frameIndex].offsetHeight;
        var canAccessFrame = false;
        try {
          var frameBody = frames[frameIndex].contentWindow.document.body;
          canAccessFrame = true;
        }
        catch(eFrames) {
          dbg(eFrames);
        }

        if (frameSize > biggestFrameSize) {
          biggestFrameSize = frameSize;
          readability.biggestFrame = frames[frameIndex];
        }

        if (canAccessFrame && frameSize > bestFrameSize) {
          readability.frameHack = true;

          bestFrame = frames[frameIndex];
          bestFrameSize = frameSize;
        }
      }

      if (bestFrame) {
        var newBody = document.createElement('body');
        readability.moveNodeInnards(bestFrame.contentWindow.document.body, newBody);
        newBody.style.overflow = 'scroll';
        document.body = newBody;

        var frameset = document.getElementsByTagName('frameset')[0];
        if (frameset) {
          frameset.parentNode.removeChild(frameset);
        }
      }
    }

    /* Remove all stylesheets. */
    for (var k = 0; k < document.styleSheets.length; k += 1) {
      if (document.styleSheets[k].href !== null && document.styleSheets[k].href.lastIndexOf("readability") === -1) {
        document.styleSheets[k].disabled = true;
      }
    }

    /* Remove all style tags in head (not doing this on IE) - TODO: Why not? */
    var styleTags = document.getElementsByTagName("style");
    for (var st = 0; st < styleTags.length; st += 1) {
      styleTags[st].textContent = "";
    }

    /* Turn all double br's into p's. */
    /* Note, this is pretty costly as far as processing goes. Maybe optimize later. */
    readability.replaceDoubleBrsWithPs(document.body);
    readability.replaceFontsWithSpans(document.body);
  },

  /**
   * Prepare the article node for display. Clean out any inline styles,
   * iframes, forms, strip extraneous <p> tags, etc.
   **/
  prepArticle: function (articleContent) {
    readability.cleanStyles(articleContent);
    readability.killBreaks(articleContent);

    /* Clean out junk from the article content. */
    readability.cleanConditionally(articleContent, "form");
    readability.clean(articleContent, "object");
    readability.clean(articleContent, "h1");

    /**
     * If there is only one h2, they are probably using it
     * as a header and not a subheader, so remove it since we already have a header.
     **/
    if (articleContent.getElementsByTagName('h2').length === 1) {
      readability.clean(articleContent, "h2");
    }
    readability.clean(articleContent, "iframe");

    readability.cleanHeaders(articleContent);

    /* Do these last as the previous stuff may have removed junk that will affect these. */
    readability.cleanConditionally(articleContent, "table");
    readability.cleanConditionally(articleContent, "ul");
    readability.cleanConditionally(articleContent, "div");

    /* Remove extra paragraphs. */
    var articleParagraphs = articleContent.getElementsByTagName('p');
    for (var i = articleParagraphs.length - 1; i >= 0; i -= 1) {
      var imgCount    = articleParagraphs[i].getElementsByTagName('img').length;
      var embedCount  = articleParagraphs[i].getElementsByTagName('embed').length;
      var objectCount = articleParagraphs[i].getElementsByTagName('object').length;

      if (imgCount === 0 && embedCount === 0 && objectCount === 0 && readability.getInnerText(articleParagraphs[i], false) === '') {
        articleParagraphs[i].parentNode.removeChild(articleParagraphs[i]);
      }
    }

    try {
      readability.replaceBrsWithPs(articleContent);
    }
    catch(e) {
      dbg("Cleaning innerHTML of breaks failed. This is an IE strict-block-elements bug. Ignoring.: " + e);
    }
  },

  /**
   * Initialize a node with the readability object. Also checks the
   * className/id for special names to add to its score.
   **/
  initializeNode: function (node) {
    node.readability = {"contentScore": 0};

    switch(node.tagName) {
      case 'DIV':
        node.readability.contentScore += 5;
        break;

      case 'PRE':
      case 'TD':
      case 'BLOCKQUOTE':
        node.readability.contentScore += 3;
        break;

      case 'ADDRESS':
      case 'OL':
      case 'UL':
      case 'DL':
      case 'DD':
      case 'DT':
      case 'LI':
      case 'FORM':
        node.readability.contentScore -= 3;
        break;

      case 'H1':
      case 'H2':
      case 'H3':
      case 'H4':
      case 'H5':
      case 'H6':
      case 'TH':
        node.readability.contentScore -= 5;
        break;
    }

    node.readability.contentScore += readability.getClassWeight(node);
  },
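
  /**
   * Each scored node therefore starts from a small tag-based prior (a DIV, for
   * example, starts at +5) and is then adjusted by getClassWeight(), which
   * rewards content-like class names and ids and penalizes navigation- or
   * footer-like ones.
   **/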

  /***
   * grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is
   *               most likely to be the stuff a user wants to read. Then return it wrapped up in a div.
   *
   * @param pageToClone a document to run upon. Needs to be a full document, complete with body.
   **/
  grabArticle: function (pageToClone) {
    var stripUnlikelyCandidates = readability.flagIsActive(readability.FLAG_STRIP_UNLIKELYS),
        isPaging = (pageToClone !== null) ? true : false,
        page;

    // Never work on the actual page.
    if (!pageToClone) {
      page = document.body.cloneNode(true);
    }
    else {
      page = pageToClone.cloneNode(true);
    }

    var allElements = page.getElementsByTagName('*');

    /**
     * First, node prepping. Trash nodes that look cruddy (like ones with the class name "comment", etc), and turn divs
     * into P tags where they have been used inappropriately (as in, where they contain no other block level elements.)
     *
     * Note: Assignment from index for performance. See http://www.peachpit.com/articles/article.aspx?p=31567&seqNum=5
     * TODO: Shouldn't this be a reverse traversal?
     **/
    var node = null;
    var nodesToScore = [];
    for (var nodeIndex = 0; (node = allElements[nodeIndex]); nodeIndex += 1) {
      /* Remove unlikely candidates. */
      if (stripUnlikelyCandidates) {
        var unlikelyMatchString = node.className + node.id;
        if (unlikelyMatchString.search(readability.regexps.unlikelyCandidates) !== -1 &&
            unlikelyMatchString.search(readability.regexps.okMaybeItsACandidate) === -1 &&
            node.tagName !== "BODY") {
          dbg("Removing unlikely candidate - " + unlikelyMatchString);
          node.parentNode.removeChild(node);
          nodeIndex -= 1;
          continue;
        }
      }

      if (node.tagName === "P" || node.tagName === "TD" || node.tagName === "PRE") {
        nodesToScore[nodesToScore.length] = node;
      }

      /* Turn all divs that don't have children block level elements into p's. */
      if (node.tagName === "DIV") {
        if (node.innerHTML.search(readability.regexps.divToPElements) === -1) {
          var newNode = document.createElement('p');
          try {
            readability.moveNodeInnards(node, newNode);
            node.parentNode.replaceChild(newNode, node);
            nodeIndex -= 1;

            nodesToScore[nodesToScore.length] = node;
          }
          catch(e) {
            dbg("Could not alter div to p, probably an IE restriction, reverting back to div.: " + e);
          }
        }
        else {
          for (var i = 0, il = node.childNodes.length; i < il; i += 1) {
            var childNode = node.childNodes[i];
            if (childNode.nodeType === 3) { // Node.TEXT_NODE
              var p = document.createElement('p');
              var t = document.createTextNode(childNode.nodeValue);
              p.appendChild(t);
              p.style.display = 'inline';
              p.className = 'readability-styled';
              childNode.parentNode.replaceChild(p, childNode);
            }
          }
        }
      }
    }

    /**
     * Loop through all paragraphs, and assign a score to them based on how content-y they look.
     * Then add their score to their parent node.
     *
     * A score is determined by things like number of commas, class names, etc. Maybe eventually link density.
     **/
    var candidates = [];
    for (var pt = 0; pt < nodesToScore.length; pt += 1) {
      var parentNode      = nodesToScore[pt].parentNode;
      var grandParentNode = parentNode ? parentNode.parentNode : null;
      var innerText       = readability.getInnerText(nodesToScore[pt]);

      if (!parentNode || typeof(parentNode.tagName) === 'undefined') {
        continue;
      }

      /* If this paragraph is less than 25 characters, don't even count it. */
      if (innerText.length < 25) {
        continue;
      }

      /* Initialize readability data for the parent. */
      if (typeof parentNode.readability === 'undefined') {
        readability.initializeNode(parentNode);
        candidates.push(parentNode);
      }

      /* Initialize readability data for the grandparent. */
      if (grandParentNode && typeof(grandParentNode.readability) === 'undefined' && typeof(grandParentNode.tagName) !== 'undefined') {
        readability.initializeNode(grandParentNode);
        candidates.push(grandParentNode);
      }

      var contentScore = 0;

      /* Add a point for the paragraph itself as a base. */
      contentScore += 1;

      /* Add points for any commas within this paragraph. */
      contentScore += innerText.split(',').length;

      /* For every 100 characters in this paragraph, add another point. Up to 3 points. */
      contentScore += Math.min(Math.floor(innerText.length / 100), 3);

      /* Add the score to the parent. The grandparent gets half. */
      parentNode.readability.contentScore += contentScore;

      if (grandParentNode) {
        grandParentNode.readability.contentScore += contentScore / 2;
      }
    }
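
    /**
     * Worked example: a 260-character paragraph containing three commas scores
     * 1 (base) + 4 (the comma split yields four pieces) + 2 (two full hundreds
     * of characters) = 7; its parent gains 7 points and its grandparent 3.5.
     **/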

    /**
     * After we've calculated scores, loop through all of the possible candidate nodes we found
     * and find the one with the highest score.
     **/
    var topCandidate = null;
    for (var c = 0, cl = candidates.length; c < cl; c += 1) {
      /**
       * Scale the final candidates score based on link density. Good content should have a
       * relatively small link density (5% or less) and be mostly unaffected by this operation.
       **/
      candidates[c].readability.contentScore = candidates[c].readability.contentScore * (1 - readability.getLinkDensity(candidates[c]));

      dbg('Candidate: ' + candidates[c] + " (" + candidates[c].className + ":" + candidates[c].id + ") with score " + candidates[c].readability.contentScore);

      if (!topCandidate || candidates[c].readability.contentScore > topCandidate.readability.contentScore) {
        topCandidate = candidates[c];
      }
    }

    /**
     * If we still have no top candidate, just use the body as a last resort.
     * We also have to copy the body node so it is something we can modify.
     **/
    if (topCandidate === null || topCandidate.tagName === "BODY") {
      topCandidate = document.createElement("DIV");
      readability.replaceNodeInnards(page, topCandidate);
      page.appendChild(topCandidate);
      readability.initializeNode(topCandidate);
    }

    /**
     * Now that we have the top candidate, look through its siblings for content that might also be related.
     * Things like preambles, content split by ads that we removed, etc.
     **/
    var articleContent = document.createElement("DIV");
    if (isPaging) {
      articleContent.id = "readability-content";
    }
    var siblingScoreThreshold = Math.max(10, topCandidate.readability.contentScore * 0.2);
    var siblingNodes          = topCandidate.parentNode.childNodes;

    for (var s = 0, sl = siblingNodes.length; s < sl; s += 1) {
      var siblingNode = siblingNodes[s];
      var append      = false;

      /**
       * Fix for odd IE7 Crash where siblingNode does not exist even though this should be a live nodeList.
       * Example of error visible here: http://www.esquire.com/features/honesty0707
       **/
      if (!siblingNode) {
        continue;
      }

      dbg("Looking at sibling node: " + siblingNode + " (" + siblingNode.className + ":" + siblingNode.id + ")" + ((typeof siblingNode.readability !== 'undefined') ? (" with score " + siblingNode.readability.contentScore) : ''));
      dbg("Sibling has score " + (siblingNode.readability ? siblingNode.readability.contentScore : 'Unknown'));

      if (siblingNode === topCandidate) {
        append = true;
      }

      var contentBonus = 0;
      /* Give a bonus if sibling nodes and top candidates have the same classname. */
      if (siblingNode.className === topCandidate.className && topCandidate.className !== "") {
        contentBonus += topCandidate.readability.contentScore * 0.2;
      }

      if (typeof siblingNode.readability !== 'undefined' && (siblingNode.readability.contentScore + contentBonus) >= siblingScoreThreshold) {
        append = true;
      }

      if (siblingNode.nodeName === "P") {
        var linkDensity = readability.getLinkDensity(siblingNode);
        var nodeContent = readability.getInnerText(siblingNode);
        var nodeLength  = nodeContent.length;

        if (nodeLength > 80 && linkDensity < 0.25) {
          append = true;
        }
        else if (nodeLength < 80 && linkDensity === 0 && nodeContent.search(/\.( |$)/) !== -1) {
          append = true;
        }
      }

      if (append) {
        dbg("Appending node: " + siblingNode);

        var nodeToAppend = null;
        if (siblingNode.nodeName !== "DIV" && siblingNode.nodeName !== "P") {
          /* We have a node that isn't a common block level element, like a form or td tag. Turn it into a div so it doesn't get filtered out later by accident. */
          try {
            dbg("Altering siblingNode of " + siblingNode.nodeName + ' to div.');
            nodeToAppend = document.createElement("DIV");

            nodeToAppend.id = siblingNode.id;
            readability.moveNodeInnards(siblingNode, nodeToAppend);
          }
          catch(e) {
            dbg("Could not alter siblingNode to div, probably an IE restriction, reverting back to original.");
            nodeToAppend = siblingNode;
            s -= 1;
            sl -= 1;
          }
        }
        else {
          nodeToAppend = siblingNode;
          s -= 1;
          sl -= 1;
        }

        /* To ensure a node does not interfere with readability styles, remove its classnames. */
        nodeToAppend.className = "";

        /* Append sibling and subtract from our list because it removes the node when you append to another node. */
        articleContent.appendChild(nodeToAppend);
      }
    }

    /**
     * So we have all of the content that we need. Now we clean it up for presentation.
     **/
    readability.distilledArticleContent = articleContent.cloneNode(true);
    //readability.prepArticle(articleContent);

    if (readability.curPageNum === 1) {
      var newNode = document.createElement('div');
      newNode.id = "readability-page-1";
      newNode.setAttribute("class", "page");
      readability.moveNodeInnards(articleContent, newNode);
      articleContent.appendChild(newNode);
    }

    /**
     * Now that we've gone through the full algorithm, check to see if we got any meaningful content.
     * If we didn't, we may need to re-run grabArticle with different flags set. This gives us a higher
     * likelihood of finding the content, and the sieve approach gives us a higher likelihood of
     * finding the -right- content.
     **/
    if (readability.getInnerText(articleContent, false).length < 250) {
      if (readability.flagIsActive(readability.FLAG_STRIP_UNLIKELYS)) {
        readability.removeFlag(readability.FLAG_STRIP_UNLIKELYS);
        return readability.grabArticle(document.body);
      }
      else if (readability.flagIsActive(readability.FLAG_WEIGHT_CLASSES)) {
        readability.removeFlag(readability.FLAG_WEIGHT_CLASSES);
        return readability.grabArticle(document.body);
      }
      else if (readability.flagIsActive(readability.FLAG_CLEAN_CONDITIONALLY)) {
        readability.removeFlag(readability.FLAG_CLEAN_CONDITIONALLY);
        return readability.grabArticle(document.body);
      }
      else {
        return null;
      }
    }

    return articleContent;
  },

  /**
   * Removes script tags from the document.
   **/
  removeScripts: function (doc) {
    var scripts = doc.getElementsByTagName('script');
    for (var i = scripts.length - 1; i >= 0; i -= 1) {
      if (typeof(scripts[i].src) === "undefined" || (scripts[i].src.indexOf('readability') === -1 && scripts[i].src.indexOf('typekit') === -1)) {
        scripts[i].nodeValue = "";
        scripts[i].removeAttribute('src');
        if (scripts[i].parentNode) {
          scripts[i].parentNode.removeChild(scripts[i]);
        }
      }
    }
  },

  /**
   * Get the inner text of a node - cross browser compatibly.
   * This also strips out any excess whitespace to be found.
   **/
  getInnerText: function (e, normalizeSpaces) {
    var textContent = "";

    if (typeof(e.textContent) === "undefined" && typeof(e.innerText) === "undefined") {
      return "";
    }

    normalizeSpaces = (typeof normalizeSpaces === 'undefined') ? true : normalizeSpaces;

    if (navigator.appName === "Microsoft Internet Explorer") {
      textContent = e.innerText.replace(readability.regexps.trim, "");
    }
    else {
      textContent = e.textContent.replace(readability.regexps.trim, "");
    }

    if (normalizeSpaces) {
      return textContent.replace(readability.regexps.normalize, " ");
    }
    return textContent;
  },

  /**
   * Get the number of times a string s appears in the node e.
   *
   * @param string - what to split on. Default is ","
   * @return number (integer)
   **/
  getCharCount: function (e, s) {
    s = s || ",";
    return readability.getInnerText(e).split(s).length - 1;
  },

  /**
   * Remove the style attribute on every e and under.
   * TODO: Test if getElementsByTagName(*) is faster.
   **/
  cleanStyles: function (e) {
    e = e || document;
    var cur = e.firstChild;

    if (!e) {
      return;
    }

    // Remove any root styles, if we're able.
    if (typeof e.removeAttribute === 'function' && e.className !== 'readability-styled') {
      e.removeAttribute('style');
    }

    // Go until there are no more child nodes.
    while (cur !== null) {
      if (cur.nodeType === 1) {
        // Remove style attribute(s):
        if (cur.className !== "readability-styled") {
          cur.removeAttribute("style");
        }
        readability.cleanStyles(cur);
      }
      cur = cur.nextSibling;
    }
  },

  /**
   * Get the density of links as a percentage of the content.
   * This is the amount of text that is inside a link divided by the total text in the node.
   *
   * @return number (float)
   **/
  getLinkDensity: function (e) {
    var links      = e.getElementsByTagName("a");
    var textLength = readability.getInnerText(e).length;
    var linkLength = 0;
    for (var i = 0, il = links.length; i < il; i += 1) {
      linkLength += readability.getInnerText(links[i]).length;
    }

    return linkLength / textLength;
  },
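
  /**
   * e.g. a node with 200 characters of text, 30 of which sit inside <a> tags,
   * has a link density of 30 / 200 = 0.15.
   **/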

  /**
   * Find a cleaned up version of the current URL, to use for comparing links for possible next-pageyness.
   *
   * @return string the base url
   **/
  findBaseUrl: function () {
    var noUrlParams     = window.location.pathname.split("?")[0],
        urlSlashes      = noUrlParams.split("/").reverse(),
        cleanedSegments = [],
        possibleType    = "";

    for (var i = 0, slashLen = urlSlashes.length; i < slashLen; i += 1) {
      var segment = urlSlashes[i];

      // Split off and save anything that looks like a file type.
      if (segment.indexOf(".") !== -1) {
        possibleType = segment.split(".")[1];

        /* If the type isn't alpha-only, it's probably not actually a file extension. */
        if (!possibleType.match(/[^a-zA-Z]/)) {
          segment = segment.split(".")[0];
        }
      }

      /**
       * EW-CMS specific segment replacement. Ugly.
       * Example: http://www.ew.com/ew/article/0,,20313460_20369436,00.html
       **/
      if (segment.indexOf(',00') !== -1) {
        segment = segment.replace(',00', '');
      }

      // If our first or second segment has anything looking like a page number, remove it.
      if (segment.match(/((_|-)?p[a-z]*|(_|-))[0-9]{1,2}$/i) && ((i === 1) || (i === 0))) {
        segment = segment.replace(/((_|-)?p[a-z]*|(_|-))[0-9]{1,2}$/i, "");
      }

      var del = false;

      /* If this is purely a number, and it's the first or second segment, it's probably a page number. Remove it. */
      if (i < 2 && segment.match(/^\d{1,2}$/)) {
        del = true;
      }

      /* If this is the first segment and it's just "index", remove it. */
      if (i === 0 && segment.toLowerCase() === "index") {
        del = true;
      }

      /* If our first or second segment is smaller than 3 characters, and the first segment was purely alphas, remove it. */
      if (i < 2 && segment.length < 3 && !urlSlashes[0].match(/[a-z]/i)) {
        del = true;
      }

      /* If it's not marked for deletion, push it to cleanedSegments. */
      if (!del) {
        cleanedSegments.push(segment);
      }
    }

    // This is our final, cleaned, base article URL.
    return window.location.protocol + "//" + window.location.host + cleanedSegments.reverse().join("/");
  },
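
  /**
   * e.g. for a window.location of http://example.com/article/2, the trailing
   * "2" is dropped as a probable page number and the base URL becomes
   * http://example.com/article.
   **/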

  /**
   * Look for any paging links that may occur within the document.
   *
   * @return string the next page link, or null
   **/
  findNextPageLink: function (elem) {
    var possiblePages  = {},
        allLinks       = elem.getElementsByTagName('a'),
        articleBaseUrl = readability.findBaseUrl();

    /**
     * Loop through all links, looking for hints that they may be next-page links.
     * Things like having "page" in their textContent, className or id, or being a child
     * of a node with a page-y className or id.
     *
     * Also possible: levenshtein distance? longest common subsequence?
     *
     * After we do that, assign each page a score, and pick the link with the highest score.
     **/
    for (var i = 0, il = allLinks.length; i < il; i += 1) {
      var link     = allLinks[i],
          linkHref = allLinks[i].href.replace(/#.*$/, '').replace(/\/$/, '');

      /* If we've already seen this page, ignore it. */
      if (linkHref === "" || linkHref === articleBaseUrl || linkHref === window.location.href || linkHref in readability.parsedPages) {
        continue;
      }

      /* If it's on a different domain, skip it. */
      if (window.location.host !== linkHref.split(/\/+/g)[1]) {
        continue;
      }

      var linkText = readability.getInnerText(link);

      /* If the linkText looks like it's not the next page, skip it. */
      if (linkText.match(readability.regexps.extraneous) || linkText.length > 25) {
        continue;
      }

      /* If the leftovers of the URL after removing the base URL don't contain any digits, it's certainly not a next page link. */
      var linkHrefLeftover = linkHref.replace(articleBaseUrl, '');
      if (!linkHrefLeftover.match(/\d/)) {
        continue;
      }

      if (!(linkHref in possiblePages)) {
        possiblePages[linkHref] = {"score": 0, "linkText": linkText, "href": linkHref};
      }
      else {
        possiblePages[linkHref].linkText += ' | ' + linkText;
      }

      var linkObj = possiblePages[linkHref];

      /**
       * If the articleBaseUrl isn't part of this URL, penalize this link. It could still be the link, but the odds are lower.
       * Example: http://www.actionscript.org/resources/articles/745/1/JavaScript-and-VBScript-Injection-in-ActionScript-3/Page1.html
       **/
      if (linkHref.indexOf(articleBaseUrl) !== 0) {
        linkObj.score -= 25;
      }

      var linkData = linkText + ' ' + link.className + ' ' + link.id;
      if (linkData.match(readability.regexps.nextLink)) {
        linkObj.score += 50;
      }
      if (linkData.match(/pag(e|ing|inat)/i)) {
        linkObj.score += 25;
      }
      if (linkData.match(/(first|last)/i)) { // -65 is enough to negate any bonuses gotten from a > or » in the text.
        /* If we already matched on "next", last is probably fine. If we didn't, then it's bad. Penalize. */
        if (!linkObj.linkText.match(readability.regexps.nextLink)) {
          linkObj.score -= 65;
        }
      }
      if (linkData.match(readability.regexps.negative) || linkData.match(readability.regexps.extraneous)) {
        linkObj.score -= 50;
      }
      if (linkData.match(readability.regexps.prevLink)) {
        linkObj.score -= 200;
      }

      /* If a parentNode contains page or paging or paginat. */
      var parentNode        = link.parentNode,
          positiveNodeMatch = false,
          negativeNodeMatch = false;
      while (parentNode) {
        var parentNodeClassAndId = parentNode.className + ' ' + parentNode.id;
        if (!positiveNodeMatch && parentNodeClassAndId && parentNodeClassAndId.match(/pag(e|ing|inat)/i)) {
          positiveNodeMatch = true;
          linkObj.score += 25;
        }
        if (!negativeNodeMatch && parentNodeClassAndId && parentNodeClassAndId.match(readability.regexps.negative)) {
          /* If this is just something like "footer", give it a negative. If it's something like "body-and-footer", leave it be. */
          if (!parentNodeClassAndId.match(readability.regexps.positive)) {
            linkObj.score -= 25;
            negativeNodeMatch = true;
          }
        }

        parentNode = parentNode.parentNode;
      }

      /**
       * If the URL looks like it has paging in it, add to the score.
       * Things like /page/2/, /pagenum/2, ?p=3, ?page=11, ?pagination=34
       **/
      if (linkHref.match(/p(a|g|ag)?(e|ing|ination)?(=|\/)[0-9]{1,2}/i) || linkHref.match(/(page|paging)/i)) {
        linkObj.score += 25;
      }

      /* If the URL contains negative values, give a slight decrease. */
      if (linkHref.match(readability.regexps.extraneous)) {
        linkObj.score -= 15;
      }

      /**
       * Minor punishment to anything that doesn't match our current URL.
       * NOTE: I'm finding this to cause more harm than good where something is exactly 50 points.
       *       Dan, can you show me a counterexample where this is necessary?
       * if (linkHref.indexOf(window.location.href) !== 0) {
       *   linkObj.score -= 1;
       * }
       **/

      /**
       * If the link text can be parsed as a number, give it a minor bonus, with a slight
       * bias towards lower numbered pages. This is so that pages that might not have 'next'
       * in their text can still get scored, and sorted properly by score.
       **/
      var linkTextAsNumber = parseInt(linkText, 10);
      if (linkTextAsNumber) {
        // Punish 1 since we're either already there, or it's probably before what we want anyways.
        if (linkTextAsNumber === 1) {
          linkObj.score -= 10;
        }
        else {
          // Award a small bonus, favoring lower page numbers: "2" gets +8, "11" and beyond get nothing.
          linkObj.score += Math.max(0, 10 - linkTextAsNumber);
        }
      }
    }

    /**
     * Loop through all of our possible pages from above and find our top candidate for the next page URL.
     * Require at least a score of 50, which is a relatively high confidence that this page is the next link.
     **/
    var topPage = null;
    for (var page in possiblePages) {
      if (possiblePages.hasOwnProperty(page)) {
        if (possiblePages[page].score >= 50 && (!topPage || topPage.score < possiblePages[page].score)) {
          topPage = possiblePages[page];
        }
      }
    }

    if (topPage) {
      var nextHref = topPage.href.replace(/\/$/, '');

      dbg('NEXT PAGE IS ' + nextHref);
      readability.parsedPages[nextHref] = true;
      return nextHref;
    }

    return null;
  },
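
  /**
   * e.g. a link whose text is "Next Page" and whose href is
   * articleBaseUrl + "/2" gets +50 for matching regexps.nextLink and +25 for
   * containing "page", putting it at 75, comfortably above the 50-point
   * acceptance threshold.
   **/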

  /* Build a centered "View Next Page" link pointing at |link|. */
  createLinkDiv: function(link) {
    var divNode = document.createElement('div');
    var aNode   = document.createElement('a');
    var tNode   = document.createTextNode('View Next Page');
    divNode.setAttribute('style', 'text-align: center');
    aNode.setAttribute('href', link);
    aNode.appendChild(tNode);
    divNode.appendChild(aNode);
    return divNode;
  },

  /* Build a cross-browser XMLHttpRequest. */
  xhr: function () {
    if (typeof XMLHttpRequest !== 'undefined' && (window.location.protocol !== 'file:' || !window.ActiveXObject)) {
      return new XMLHttpRequest();
    }
    else {
      try { return new ActiveXObject('Msxml2.XMLHTTP.6.0'); } catch(sixerr) { }
      try { return new ActiveXObject('Msxml2.XMLHTTP.3.0'); } catch(threrr) { }
      try { return new ActiveXObject('Msxml2.XMLHTTP'); } catch(err) { }
    }

    return false;
  },

  successfulRequest: function (request) {
    return (request.status >= 200 && request.status < 300) || request.status === 304 || (request.status === 0 && request.responseText);
  },

  ajax: function (url, options) {
    var request = readability.xhr();

    function respondToReadyState(readyState) {
      if (request.readyState === 4) {
        if (readability.successfulRequest(request)) {
          if (options.success) { options.success(request); }
        }
        else {
          if (options.error) { options.error(request); }
        }
      }
    }

    if (typeof options === 'undefined') { options = {}; }

    request.onreadystatechange = respondToReadyState;

    request.open('get', url, true);
    request.setRequestHeader('Accept', 'text/html');

    try {
      request.send(options.postBody);
    }
    catch (e) {
      if (options.error) { options.error(); }
    }

    return request;
  },
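
  /**
   * Illustrative use, mirroring what appendNextPage() does below:
   *
   *   readability.ajax(nextPageLink, {
   *     success: function(r) { dbg(r.responseText.length); }
   *   });
   *
   * The success/error callbacks receive the finished XMLHttpRequest.
   **/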

  /**
   * Make an AJAX request for each page and append it to the document.
   **/
  appendNextPage: function (nextPageLink) {
    readability.curPageNum += 1;

    var articlePage       = document.createElement("DIV");
    articlePage.id        = 'readability-page-' + readability.curPageNum;
    articlePage.className = 'page';
    articlePage.innerHTML = '<p class="page-separator" title="Page ' + readability.curPageNum + '">§</p>';

    document.getElementById("readability-content").appendChild(articlePage);

    if (readability.curPageNum > readability.maxPages) {
      var linkDiv = readability.createLinkDiv(nextPageLink);
      articlePage.appendChild(linkDiv);
      return;
    }

    /**
     * Now that we've built the article page DOM element, get the page content
     * asynchronously and load the cleaned content into the div we created for it.
     **/
    (function(pageUrl, thisPage) {
      readability.ajax(pageUrl, {
        success: function(r) {

          /* First, check to see if we have a matching ETag in headers - if we do, this is a duplicate page. */
          var eTag = r.getResponseHeader('ETag');
          if (eTag) {
            if (eTag in readability.pageETags) {
              dbg("Exact duplicate page found via ETag. Aborting.");
              articlePage.style.display = 'none';
              readability.parsedPages[pageUrl] = true;
              return;
            }
            readability.pageETags[eTag] = 1;
          }

          // TODO: this ends up doubling up page numbers on NYTimes articles. Need to generically parse those away.
          var page = document.createElement("DIV");

          /**
           * Do some preprocessing to our HTML to make it ready for appending.
           * • Remove any script tags. Swap and reswap newlines with a unicode character because multiline regex doesn't work in javascript.
           * • Turn any noscript tags into divs so that we can parse them. This allows us to find any next page links hidden via javascript.
           * • Turn all double br's into p's - was handled by prepDocument in the original view.
           *   Maybe in the future abstract out prepDocument to work for both the original document and AJAX-added pages.
           **/
          var pageInnards = r.responseXML;
          readability.removeScripts(pageInnards);
          readability.replaceNoscriptsWithPs(pageInnards);
          readability.replaceDoubleBrsWithPs(pageInnards);
          readability.replaceFontsWithSpans(pageInnards);
          page.appendChild(pageInnards);

          /**
           * Reset all flags for the next page, as they will search through it and disable as necessary at the end of grabArticle.
           **/
          readability.flags = 0x1 | 0x2 | 0x4;

          var nextPageLink = readability.findNextPageLink(page),
              content      = readability.grabArticle(page);

          if (!content) {
            dbg("No content found in page to append. Aborting.");
            return;
          }

          /**
           * Anti-duplicate mechanism. Essentially, get the first paragraph of our new page.
           * Compare it against all of the previous documents we've gotten. If a previous
           * document contains exactly the innerHTML of this first paragraph, it's probably a duplicate.
           **/
          var firstP = content.getElementsByTagName("P").length ? content.getElementsByTagName("P")[0] : null;
          if (firstP && firstP.innerHTML.length > 100) {
            for (var i = 1; i <= readability.curPageNum; i += 1) {
              var rPage = document.getElementById('readability-page-' + i);
              if (rPage && rPage.innerHTML.indexOf(firstP.innerHTML) !== -1) {
                dbg('Duplicate of page ' + i + ' - skipping.');
                articlePage.style.display = 'none';
                readability.parsedPages[pageUrl] = true;
                return;
              }
            }
          }

          readability.removeScripts(content);

          readability.moveNodeInnards(content, thisPage);

          /**
           * After the page has rendered, post process the content. This delay is necessary because,
           * in webkit at least, offsetWidth is not set in time to determine image width. We have to
           * wait a little bit for reflow to finish before we can fix floating images.
           **/
          window.setTimeout(
            function() { readability.postProcessContent(thisPage); },
            500
          );

          if (nextPageLink) {
            readability.appendNextPage(nextPageLink);
          }
        }
      });
    }(nextPageLink, articlePage));
  },

  /**
   * Get an element's class/id weight. Uses regular expressions to tell if this
   * element looks good or bad.
   *
   * @return number (Integer)
   **/
  getClassWeight: function (e) {
    if (!readability.flagIsActive(readability.FLAG_WEIGHT_CLASSES)) {
      return 0;
    }

    var weight = 0;

    /* Look for a special classname. */
    if (typeof(e.className) === 'string' && e.className !== '') {
      if (e.className.search(readability.regexps.negative) !== -1) {
        weight -= 25;
      }

      if (e.className.search(readability.regexps.positive) !== -1) {
        weight += 25;
      }
    }

    /* Look for a special ID. */
    if (typeof(e.id) === 'string' && e.id !== '') {
      if (e.id.search(readability.regexps.negative) !== -1) {
        weight -= 25;
      }

      if (e.id.search(readability.regexps.positive) !== -1) {
        weight += 25;
      }
    }

    return weight;
  },

  nodeIsVisible: function (node) {
    return (node.offsetWidth !== 0 || node.offsetHeight !== 0) && node.style.display.toLowerCase() !== 'none';
  },

  /**
   * Remove extraneous break tags from a node.
   **/
  killBreaks: function (e) {
    var allElements = e.getElementsByTagName('*');
    var i = 0;
    while (i < allElements.length) {
      readability.deleteExtraBreaks(allElements[i]);
      i += 1;
    }
  },

  /**
   * Clean a node of all elements of type "tag".
   * (Unless it's a youtube/vimeo video. People love movies.)
   *
   * @param string tag to clean
   **/
  clean: function (e, tag) {
    var targetList = e.getElementsByTagName(tag);
    var isEmbed    = (tag === 'object' || tag === 'embed');

    for (var y = targetList.length - 1; y >= 0; y -= 1) {
      /* Allow youtube and vimeo videos through as people usually want to see those. */
      if (isEmbed) {
        var attributeValues = "";
        for (var i = 0, il = targetList[y].attributes.length; i < il; i += 1) {
          attributeValues += targetList[y].attributes[i].value + '|';
        }

        /* First, check the elements attributes to see if any of them contain youtube or vimeo. */
        if (attributeValues.search(readability.regexps.videos) !== -1) {
          continue;
        }

        /* Then check the elements inside this element for the same. */
        if (targetList[y].innerHTML.search(readability.regexps.videos) !== -1) {
          continue;
        }
      }

      targetList[y].parentNode.removeChild(targetList[y]);
    }
  },

  /**
   * Clean an element of all tags of type "tag" if they look fishy.
   * "Fishy" is an algorithm based on content length, classnames, link density, number of images & embeds, etc.
   **/
  cleanConditionally: function (e, tag) {

    if (!readability.flagIsActive(readability.FLAG_CLEAN_CONDITIONALLY)) {
      return;
    }

    var tagsList      = e.getElementsByTagName(tag);
    var curTagsLength = tagsList.length;

    /**
     * Gather counts for other typical elements embedded within.
     * Traverse backwards so we can remove nodes at the same time without affecting the traversal.
     *
     * TODO: Consider taking into account original contentScore here.
     **/
    for (var i = curTagsLength - 1; i >= 0; i -= 1) {
      var weight = readability.getClassWeight(tagsList[i]);
      var contentScore = (typeof tagsList[i].readability !== 'undefined') ? tagsList[i].readability.contentScore : 0;

      dbg("Cleaning Conditionally " + tagsList[i] + " (" + tagsList[i].className + ":" + tagsList[i].id + ")" + ((typeof tagsList[i].readability !== 'undefined') ? (" with score " + tagsList[i].readability.contentScore) : ''));

      if (weight + contentScore < 0) {
        tagsList[i].parentNode.removeChild(tagsList[i]);
      }
      else if (readability.getCharCount(tagsList[i], ',') < 10) {
        /**
         * If there are not very many commas, and the number of
         * non-paragraph elements is more than paragraphs or other ominous signs, remove the element.
         **/
        var p     = tagsList[i].getElementsByTagName("p").length;
        var img   = tagsList[i].getElementsByTagName("img").length;
        var li    = tagsList[i].getElementsByTagName("li").length - 100;
        var input = tagsList[i].getElementsByTagName("input").length;

        var embedCount = 0;
        var embeds     = tagsList[i].getElementsByTagName("embed");
        for (var ei = 0, il = embeds.length; ei < il; ei += 1) {
          if (embeds[ei].src.search(readability.regexps.videos) === -1) {
            embedCount += 1;
          }
        }

        var linkDensity   = readability.getLinkDensity(tagsList[i]);
        var contentLength = readability.getInnerText(tagsList[i]).length;
        var toRemove      = false;

        if (img > p) {
          toRemove = true;
        } else if (li > p && tag !== "ul" && tag !== "ol") {
          toRemove = true;
        } else if (input > Math.floor(p / 3)) {
          toRemove = true;
        } else if (contentLength < 25 && (img === 0 || img > 2)) {
          toRemove = true;
        } else if (weight < 25 && linkDensity > 0.2) {
          toRemove = true;
        } else if (weight >= 25 && linkDensity > 0.5) {
          toRemove = true;
        } else if ((embedCount === 1 && contentLength < 75) || embedCount > 1) {
          toRemove = true;
        }

        if (toRemove) {
          tagsList[i].parentNode.removeChild(tagsList[i]);
        }
      }
    }
  },
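
  /**
   * For instance, under these rules a <div> whose class weight is below 25 and
   * whose link density exceeds 0.2 is treated as link-heavy chrome and removed,
   * while a block containing ten or more commas skips these heuristics
   * entirely.
   **/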

  /**
   * Clean out spurious headers from an Element. Checks things like classnames and link density.
   **/
  cleanHeaders: function (e) {
    for (var headerIndex = 1; headerIndex < 3; headerIndex += 1) {
      var headers = e.getElementsByTagName('h' + headerIndex);
      for (var i = headers.length - 1; i >= 0; i -= 1) {
        if (readability.getClassWeight(headers[i]) < 0 || readability.getLinkDensity(headers[i]) > 0.33) {
          headers[i].parentNode.removeChild(headers[i]);
        }
      }
    }
  },

  flagIsActive: function(flag) {
    return (readability.flags & flag) > 0;
  },

  addFlag: function(flag) {
    readability.flags = readability.flags | flag;
  },

  removeFlag: function(flag) {
    readability.flags = readability.flags & ~flag;
  },

  // Removes the children of |src| and appends them to |dest|.
  moveNodeInnards: function(src, dest) {
    while (src.firstChild) {
      dest.appendChild(src.removeChild(src.firstChild));
    }
  },
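
  // Assumed helper: grabArticle() above calls readability.replaceNodeInnards().
  // The body below is a minimal reconstruction from that call site (empty
  // |dest|, then move the children of |src| into it) and may differ from the
  // original implementation.
  replaceNodeInnards: function(src, dest) {
    while (dest.firstChild) {
      dest.removeChild(dest.firstChild);
    }
    readability.moveNodeInnards(src, dest);
  },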

  // Returns true if the node is a whitespace text node.
  isWhitespaceNode: function(node) {
    if (node.nodeType == Node.TEXT_NODE) {
      if (node.data.trim().length == 0) {
        return true;
      }
    }
    return false;
  },

  // Returns true if the node is a <BR>.
  isBrNode: function(node) {
    return (node.tagName === 'BR');
  },

  // Returns the last <BR> node in a sequence of <BR> nodes that are only
  // separated by whitespace, or null if there are not at least two <BR> tags
  // in the sibling chain starting with |node|. Returns the second such <BR>
  // node if |restrictToTwo| is true.
  isMultipleBr: function(node, restrictToTwo) {
    if (!readability.isBrNode(node)) {
      return null;
    }
    var lastBr = null;
    var curr = node.nextSibling;
    while (curr) {
      if (readability.isWhitespaceNode(curr) || readability.isBrNode(curr)) {
        if (readability.isBrNode(curr)) {
          lastBr = curr;
        }
        curr = curr.nextSibling;
        if (restrictToTwo) {
          if (lastBr && readability.isBrNode(lastBr)) {
            return lastBr;
          }
        }
      }
      else {
        break;
      }
    }
    return lastBr;
  },

  // Removes all <BR> nodes except one and whitespace in between in a series
  // of <BR> nodes.
  deleteExtraBreaks: function(node) {
    var lastBr = readability.isMultipleBr(node, false);

    while (lastBr && lastBr != node) {
      var toRemove = lastBr;
      lastBr = lastBr.previousSibling;
      toRemove.parentNode.removeChild(toRemove);
    }
  },

  // Replaces a pair of <BR> nodes (possibly separated by whitespace), with a
  // <P> node, and makes all next siblings of that pair children of <P>, up
  // until the next pair of <BR> nodes is reached. Returns the start of that
  // next pair, or null if there is none.
  replaceDoubleBrWithP: function(node) {
    // Check that we are starting with a BR.
    var second = readability.isMultipleBr(node, true);
    if (!second) {
      return null;
    }

    // Make all next siblings of the second BR into children of a P.
    var p = document.createElement('p');
    var curr = second.nextSibling;
    var nextPair = null;
    while (curr) {
      if (readability.isMultipleBr(curr, true)) {
        nextPair = curr;
        break;
      }
      var next = curr.nextSibling;
      p.appendChild(curr.parentNode.removeChild(curr));
      curr = next;
    }

    // Remove all nodes between the first and second BR.
    curr = node.nextSibling;
    while (curr && curr != second) {
      var next = curr.nextSibling;
      curr.parentNode.removeChild(curr);
      curr = next;
    }

    // Remove the second BR.
    second.parentNode.removeChild(second);
    // Replace the first BR with the P.
    node.parentNode.replaceChild(p, node);

    return nextPair;
  },

  // Returns true if the NodeList contains a double <BR>.
  hasDoubleBr: function(nodeList) {
    for (var i = 0; i < nodeList.length; i++) {
      if (readability.isMultipleBr(nodeList[i], true)) {
        return true;
      }
    }
    return false;
  },

  // Replaces double <BR> tags with <P> tags.
  replaceDoubleBrsWithPs: function(node) {
    var allElements = node.getElementsByTagName('BR');

    while (allElements && allElements.length > 0 &&
           readability.hasDoubleBr(allElements)) {
      for (var nodeIndex = 0; (node = allElements[nodeIndex]); nodeIndex += 1) {
        var next = node;
        while (next = readability.replaceDoubleBrWithP(next));
      }
      allElements = document.body.getElementsByTagName('BR');
    }
  },
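
  // e.g. markup like "a<br><br>b<br><br>c" ends up as "a<p>b</p><p>c</p>"
  // once replaceDoubleBrsWithPs() has run to completion.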

  // Replaces a BR and the whitespace that follows it with a P. All following
  // siblings up to the next BR become children of the P. Returns that next BR,
  // or null if there is none.
  replaceBrWithP: function(node) {
    if (!readability.isBrNode(node)) {
      return null;
    }

    var p = document.createElement('p');
    var curr = node.nextSibling;
    while (curr && !readability.isBrNode(curr)) {
      var next = curr.nextSibling;
      if (readability.isWhitespaceNode(curr)) {
        curr.parentNode.removeChild(curr);
      }
      else {
        p.appendChild(curr.parentNode.removeChild(curr));
      }
      curr = next;
    }

    node.parentNode.replaceChild(p, node);
    return curr;
  },

  // Replaces all <BR> tags with <P> tags. Makes all next siblings of a <BR> tag
  // children of the <P>.
  replaceBrsWithPs: function(node) {
    var allElements = node.getElementsByTagName('BR');

    while (allElements && allElements.length > 0) {
      for (var nodeIndex = 0; (node = allElements[nodeIndex]); nodeIndex += 1) {
        var next = node;
        while (next = readability.replaceBrWithP(next));
      }
      allElements = document.body.getElementsByTagName('BR');
    }
  },

  // Replaces any tag with any other tag. Iterates backwards because the
  // NodeList is live and shrinks as elements are replaced.
  replaceTagsWithTags: function(node, srcTag, destTag) {
    var allElements = node.getElementsByTagName(srcTag);
    for (var i = allElements.length - 1; i >= 0; i--) {
      var dest = document.createElement(destTag);
      readability.moveNodeInnards(allElements[i], dest);
      allElements[i].parentNode.replaceChild(dest, allElements[i]);
    }
  },

  // Replaces all <noscript> tags with <p> tags.
  replaceNoscriptsWithPs: function(node) {
    readability.replaceTagsWithTags(node, 'noscript', 'p');
  },

  // Replaces all <font> tags with <span> tags.
  replaceFontsWithSpans: function(node) {
    readability.replaceTagsWithTags(node, 'font', 'span');
  },

  // Returns a list of image URLs in the distilled article.
  getImages: function() {
    var images = document.getElementsByTagName('img');
    var result = new Array(images.length);
    dbg("Number of images: " + images.length);
    for (var i = 0; i < images.length; i++) {
      result[i] = images[i].src;
      dbg("Image: " + result[i]);
    }
    return result;
  },

  // Returns the distilled article HTML from the page(s).
  getDistilledArticleHTML: function() {
    return readability.distilledHTML;
  },

  // Returns the next page of this article.
  getNextPageLink: function() {
    return readability.nextPageLink;
  }
};