extensions/spellcheck/src/mozInlineSpellWordUtil.cpp

   1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
   2 /* ***** BEGIN LICENSE BLOCK *****
   3  * Version: MPL 1.1/GPL 2.0/LGPL 2.1
   4  *
   5  * The contents of this file are subject to the Mozilla Public License Version
   6  * 1.1 (the "License"); you may not use this file except in compliance with
   7  * the License. You may obtain a copy of the License at
   8  * http://www.mozilla.org/MPL/
   9  *
  10  * Software distributed under the License is distributed on an "AS IS" basis,
  11  * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
  12  * for the specific language governing rights and limitations under the
  13  * License.
  14  *
  15  * The Original Code is inline spellchecker code.
  16  *
  17  * The Initial Developer of the Original Code is Google Inc.
  18  * Portions created by the Initial Developer are Copyright (C) 2004-2006
  19  * the Initial Developer. All Rights Reserved.
  20  *
  21  * Contributor(s):
  22  *   Brett Wilson <brettw@gmail.com> (original author)
  23  *   Robert O'Callahan <rocallahan@novell.com>
  24  *
  25  * Alternatively, the contents of this file may be used under the terms of
  26  * either the GNU General Public License Version 2 or later (the "GPL"), or
  27  * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
  28  * in which case the provisions of the GPL or the LGPL are applicable instead
  29  * of those above. If you wish to allow use of your version of this file only
  30  * under the terms of either the GPL or the LGPL, and not to allow others to
  31  * use your version of this file under the terms of the MPL, indicate your
  32  * decision by deleting the provisions above and replace them with the notice
  33  * and other provisions required by the GPL or the LGPL. If you do not delete
  34  * the provisions above, a recipient may use your version of this file under
  35  * the terms of any one of the MPL, the GPL or the LGPL.
  36  *
  37  * ***** END LICENSE BLOCK ***** */
  38
  39 #include "mozInlineSpellWordUtil.h"
  40 #include "nsDebug.h"
  41 #include "nsIAtom.h"
  42 #include "nsComponentManagerUtils.h"
  43 #include "nsIDOMCSSStyleDeclaration.h"
  44 #include "nsIDOMDocumentView.h"
  45 #include "nsIDOMElement.h"
  46 #include "nsIDOMNSRange.h"
  47 #include "nsIDOMRange.h"
  48 #include "nsIEditor.h"
  49 #include "nsIDOMNode.h"
  50 #include "nsIDOMHTMLBRElement.h"
  51 #include "nsUnicharUtilCIID.h"
  52 #include "nsServiceManagerUtils.h"
  53
  54 // IsIgnorableCharacter
  55 //
  56 //    These characters are ones that we should ignore in input.
  57
  58 inline PRBool IsIgnorableCharacter(PRUnichar ch)
  59 {
  60   return (ch == 0x200D || // ZERO-WIDTH JOINER
  61           ch == 0xAD ||   // SOFT HYPHEN
  62           ch == 0x1806);  // MONGOLIAN TODO SOFT HYPHEN
  63 }
  64
  65 // IsConditionalPunctuation
  66 //
  67 //    Some characters (like apostrophes) require characters on each side to be
  68 //    part of a word, and are otherwise punctuation.
  69
  70 inline PRBool IsConditionalPunctuation(PRUnichar ch)
  71 {
  72   return (ch == '\'' ||
  73           ch == 0x2019); // RIGHT SINGLE QUOTATION MARK
  74 }
  75
  76 // mozInlineSpellWordUtil::Init
  77
  78 nsresult
  79 mozInlineSpellWordUtil::Init(nsWeakPtr aWeakEditor)
  80 {
  81   nsresult rv;
  82
  83   mCategories = do_GetService(NS_UNICHARCATEGORY_CONTRACTID, &rv);
  84   if (NS_FAILED(rv))
  85     return rv;
  86
  87   // getting the editor can fail commonly because the editor was detached, so
  88   // don't assert
  89   nsCOMPtr<nsIEditor> editor = do_QueryReferent(aWeakEditor, &rv);
  90   if (NS_FAILED(rv))
  91     return rv;
  92
  93   nsCOMPtr<nsIDOMDocument> domDoc;
  94   rv = editor->GetDocument(getter_AddRefs(domDoc));
  95   NS_ENSURE_SUCCESS(rv, rv);
  96
  97   mDocument = do_QueryInterface(domDoc, &rv);
  98   NS_ENSURE_SUCCESS(rv, rv);
  99
 100   mDOMDocumentRange = do_QueryInterface(domDoc, &rv);
 101   NS_ENSURE_SUCCESS(rv, rv);
 102
 103   // view
 104   nsCOMPtr<nsIDOMDocumentView> docView = do_QueryInterface(domDoc, &rv);
 105   NS_ENSURE_SUCCESS(rv, rv);
 106   nsCOMPtr<nsIDOMAbstractView> abstractView;
 107   rv = docView->GetDefaultView(getter_AddRefs(abstractView));
 108   NS_ENSURE_SUCCESS(rv, rv);
 109   mCSSView = do_QueryInterface(abstractView, &rv);
 110   NS_ENSURE_SUCCESS(rv, rv);
 111
 112   // Find the root node for the editor. For contenteditable we'll need something
 113   // cleverer here.
 114   nsCOMPtr<nsIDOMElement> rootElt;
 115   rv = editor->GetRootElement(getter_AddRefs(rootElt));
 116   NS_ENSURE_SUCCESS(rv, rv);
 117
 118   mRootNode = rootElt;
 119   NS_ASSERTION(mRootNode, "GetRootElement returned null *and* claimed to suceed!");
 120   return NS_OK;
 121 }
 122
 123 static PRBool
 124 IsTextNode(nsIDOMNode* aNode)
 125 {
 126   PRUint16 type = 0;
 127   aNode->GetNodeType(&type);
 128   return type == nsIDOMNode::TEXT_NODE;
 129 }
 130
 131 typedef void (* OnLeaveNodeFunPtr)(nsIDOMNode* aNode, void* aClosure);
 132
 133 // Find the next node in the DOM tree in preorder. This isn't fast because
 134 // one call to GetNextSibling can be O(N) in the number of siblings...
 135 // Calls OnLeaveNodeFunPtr when the traversal leaves a node
 136 static nsIDOMNode*
 137 FindNextNode(nsIDOMNode* aNode, nsIDOMNode* aRoot,
 138              OnLeaveNodeFunPtr aOnLeaveNode = nsnull, void* aClosure = nsnull)
 139 {
 140   NS_PRECONDITION(aNode, "Null starting node?");
 141
 142   nsCOMPtr<nsIDOMNode> next;
 143   aNode->GetFirstChild(getter_AddRefs(next));
 144   if (next)
 145     return next;
 146
 147   // Don't look at siblings or otherwise outside of aRoot
 148   if (aNode == aRoot)
 149     return nsnull;
 150
 151   aNode->GetNextSibling(getter_AddRefs(next));
 152   if (next)
 153     return next;
 154
 155   // Go up
 156   for (;;) {
 157     if (aOnLeaveNode) {
 158       aOnLeaveNode(aNode, aClosure);
 159     }
 160
 161     aNode->GetParentNode(getter_AddRefs(next));
 162     if (next == aRoot || ! next)
 163       return nsnull;
 164     aNode = next;
 165
 166     aNode->GetNextSibling(getter_AddRefs(next));
 167     if (next)
 168       return next;
 169   }
 170 }
 171
 172 // aNode is not a text node. Find the first text node starting at aNode/aOffset
 173 // in a preorder DOM traversal.
 174 static nsIDOMNode*
 175 FindNextTextNode(nsIDOMNode* aNode, PRInt32 aOffset, nsIDOMNode* aRoot)
 176 {
 177   NS_PRECONDITION(aNode, "Null starting node?");
 178   NS_ASSERTION(!IsTextNode(aNode), "FindNextTextNode should start with a non-text node");
 179
 180   nsIDOMNode* checkNode;
 181   // Need to start at the aOffset'th child
 182   nsCOMPtr<nsIDOMNode> child;
 183   aNode->GetFirstChild(getter_AddRefs(child));
 184   while (child && aOffset > 0) {
 185     nsCOMPtr<nsIDOMNode> next;
 186     child->GetNextSibling(getter_AddRefs(next));
 187     child.swap(next);
 188     --aOffset;
 189   }
 190   if (child) {
 191     checkNode = child;
 192   } else {
 193     // aOffset was beyond the end of the child list.
 194     // goto next node in a preorder DOM traversal.
 195     nsCOMPtr<nsIDOMNode> next;
 196     aNode->GetNextSibling(getter_AddRefs(next));
 197     while (!next) {
 198       // Go up
 199       aNode->GetParentNode(getter_AddRefs(next));
 200       if (next == aRoot || !next) {
 201         return nsnull;
 202       }
 203       aNode = next;
 204       aNode->GetNextSibling(getter_AddRefs(next));
 205     }
 206     checkNode = next;
 207   }
 208
 209   while (checkNode && !IsTextNode(checkNode)) {
 210     checkNode = FindNextNode(checkNode, aRoot);
 211   }
 212   return checkNode;
 213 }
 214
 215 // mozInlineSpellWordUtil::SetEnd
 216 //
 217 //    We have two ranges "hard" and "soft". The hard boundary is simply
 218 //    the scope of the root node. The soft boundary is that which is set
 219 //    by the caller of this class by calling this function. If this function is
 220 //    not called, the soft boundary is the same as the hard boundary.
 221 //
 222 //    When we reach the soft boundary (mSoftEnd), we keep
 223 //    going until we reach the end of a word. This allows the caller to set the
 224 //    end of the range to anything, and we will always check whole multiples of
 225 //    words. When we reach the hard boundary we stop no matter what.
 226 //
 227 //    There is no beginning soft boundary. This is because we only go to the
 228 //    previous node once, when finding the previous word boundary in
 229 //    SetPosition(). You might think of the soft boundary as being this initial
 230 //    position.
 231
 232 nsresult
 233 mozInlineSpellWordUtil::SetEnd(nsIDOMNode* aEndNode, PRInt32 aEndOffset)
 234 {
 235   NS_PRECONDITION(aEndNode, "Null end node?");
 236
 237   NS_ASSERTION(mRootNode, "Not initialized");
 238
 239   InvalidateWords();
 240
 241   if (!IsTextNode(aEndNode)) {
 242     // End at the start of the first text node after aEndNode/aEndOffset.
 243     aEndNode = FindNextTextNode(aEndNode, aEndOffset, mRootNode);
 244     aEndOffset = 0;
 245   }
 246   mSoftEnd = NodeOffset(aEndNode, aEndOffset);
 247   return NS_OK;
 248 }
 249
 250 nsresult
 251 mozInlineSpellWordUtil::SetPosition(nsIDOMNode* aNode, PRInt32 aOffset)
 252 {
 253   InvalidateWords();
 254
 255   if (!IsTextNode(aNode)) {
 256     // Start at the start of the first text node after aNode/aOffset.
 257     aNode = FindNextTextNode(aNode, aOffset, mRootNode);
 258     aOffset = 0;
 259   }
 260   mSoftBegin = NodeOffset(aNode, aOffset);
 261
 262   EnsureWords();
 263
 264   PRInt32 textOffset = MapDOMPositionToSoftTextOffset(mSoftBegin);
 265   if (textOffset < 0)
 266     return NS_OK;
 267   mNextWordIndex = FindRealWordContaining(textOffset, HINT_END, PR_TRUE);
 268   return NS_OK;
 269 }
 270
 271 void
 272 mozInlineSpellWordUtil::EnsureWords()
 273 {
 274   if (mSoftTextValid)
 275     return;
 276   BuildSoftText();
 277   BuildRealWords();
 278   mSoftTextValid = PR_TRUE;
 279 }
 280
 281 nsresult
 282 mozInlineSpellWordUtil::MakeRangeForWord(const RealWord& aWord, nsIDOMRange** aRange)
 283 {
 284   NodeOffset begin = MapSoftTextOffsetToDOMPosition(aWord.mSoftTextOffset, HINT_BEGIN);
 285   NodeOffset end = MapSoftTextOffsetToDOMPosition(aWord.EndOffset(), HINT_END);
 286   return MakeRange(begin, end, aRange);
 287 }
 288
 289 // mozInlineSpellWordUtil::GetRangeForWord
 290
 291 nsresult
 292 mozInlineSpellWordUtil::GetRangeForWord(nsIDOMNode* aWordNode,
 293                                         PRInt32 aWordOffset,
 294                                         nsIDOMRange** aRange)
 295 {
 296   // Set our soft end and start
 297   NodeOffset pt = NodeOffset(aWordNode, aWordOffset);
 298
 299   InvalidateWords();
 300   mSoftBegin = mSoftEnd = pt;
 301   EnsureWords();
 302
 303   PRInt32 offset = MapDOMPositionToSoftTextOffset(pt);
 304   if (offset < 0)
 305     return MakeRange(pt, pt, aRange);
 306   PRInt32 wordIndex = FindRealWordContaining(offset, HINT_BEGIN, PR_FALSE);
 307   if (wordIndex < 0)
 308     return MakeRange(pt, pt, aRange);
 309   return MakeRangeForWord(mRealWords[wordIndex], aRange);
 310 }
 311
 312 // This is to fix characters that the spellchecker may not like
 313 static void
 314 NormalizeWord(const nsSubstring& aInput, PRInt32 aPos, PRInt32 aLen, nsAString& aOutput)
 315 {
 316   aOutput.Truncate();
 317   for (PRInt32 i = 0; i < aLen; i++) {
 318     PRUnichar ch = aInput.CharAt(i + aPos);
 319
 320     // remove ignorable characters from the word
 321     if (IsIgnorableCharacter(ch))
 322       continue;
 323
 324     // the spellchecker doesn't handle curly apostrophes in all languages
 325     if (ch == 0x2019) { // RIGHT SINGLE QUOTATION MARK
 326       ch = '\'';
 327     }
 328
 329     aOutput.Append(ch);
 330   }
 331 }
 332
 333 // mozInlineSpellWordUtil::GetNextWord
 334 //
 335 //    FIXME-optimization: we shouldn't have to generate a range every single
 336 //    time. It would be better if the inline spellchecker didn't require a
 337 //    range unless the word was misspelled. This may or may not be possible.
 338
 339 nsresult
 340 mozInlineSpellWordUtil::GetNextWord(nsAString& aText, nsIDOMRange** aRange,
 341                                     PRBool* aSkipChecking)
 342 {
 343 #ifdef DEBUG_SPELLCHECK
 344   printf("GetNextWord called; mNextWordIndex=%d\n", mNextWordIndex);
 345 #endif
 346
 347   if (mNextWordIndex < 0 ||
 348       mNextWordIndex >= PRInt32(mRealWords.Length())) {
 349     mNextWordIndex = -1;
 350     *aRange = nsnull;
 351     *aSkipChecking = PR_TRUE;
 352     return NS_OK;
 353   }
 354
 355   const RealWord& word = mRealWords[mNextWordIndex];
 356   nsresult rv = MakeRangeForWord(word, aRange);
 357   NS_ENSURE_SUCCESS(rv, rv);
 358   ++mNextWordIndex;
 359   *aSkipChecking = !word.mCheckableWord;
 360   ::NormalizeWord(mSoftText, word.mSoftTextOffset, word.mLength, aText);
 361
 362 #ifdef DEBUG_SPELLCHECK
 363   printf("GetNextWord returning: %s (skip=%d)\n",
 364          NS_ConvertUTF16toUTF8(aText).get(), *aSkipChecking);
 365 #endif
 366
 367   return NS_OK;
 368 }
 369
 370 // mozInlineSpellWordUtil::MakeRange
 371 //
 372 //    Convenience function for creating a range over the current document.
 373
 374 nsresult
 375 mozInlineSpellWordUtil::MakeRange(NodeOffset aBegin, NodeOffset aEnd,
 376                                   nsIDOMRange** aRange)
 377 {
 378   if (! mDOMDocumentRange)
 379     return NS_ERROR_NOT_INITIALIZED;
 380
 381   nsresult rv = mDOMDocumentRange->CreateRange(aRange);
 382   NS_ENSURE_SUCCESS(rv, rv);
 383
 384   rv = (*aRange)->SetStart(aBegin.mNode, aBegin.mOffset);
 385   NS_ENSURE_SUCCESS(rv, rv);
 386   rv = (*aRange)->SetEnd(aEnd.mNode, aEnd.mOffset);
 387   NS_ENSURE_SUCCESS(rv, rv);
 388
 389   return NS_OK;
 390 }
 391
 392 /*********** DOM text extraction ************/
 393
 394 // IsDOMWordSeparator
 395 //
 396 //    Determines if the given character should be considered as a DOM Word
 397 //    separator. Basically, this is whitespace, although it could also have
 398 //    certain punctuation that we know ALWAYS breaks words. This is important.
 399 //    For example, we can't have any punctuation that could appear in a URL
 400 //    or email address in this, because those need to always fit into a single
 401 //    DOM word.
 402
 403 static PRBool
 404 IsDOMWordSeparator(PRUnichar ch)
 405 {
 406   // simple spaces
 407   if (ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r')
 408     return PR_TRUE;
 409
 410   // complex spaces - check only if char isn't ASCII (uncommon)
 411   if (ch >= 0xA0 &&
 412       (ch == 0x00A0 ||  // NO-BREAK SPACE
 413        ch == 0x2002 ||  // EN SPACE
 414        ch == 0x2003 ||  // EM SPACE
 415        ch == 0x2009 ||  // THIN SPACE
 416        ch == 0x200C ||  // ZERO WIDTH NON-JOINER
 417        ch == 0x3000))   // IDEOGRAPHIC SPACE
 418     return PR_TRUE;
 419
 420   // otherwise not a space
 421   return PR_FALSE;
 422 }
 423
 424 static PRBool
 425 IsBRElement(nsIDOMNode* aNode)
 426 {
 427   nsresult rv;
 428   nsCOMPtr<nsIDOMHTMLBRElement> elt = do_QueryInterface(aNode, &rv);
 429   return NS_SUCCEEDED(rv);
 430 }
 431
 432 static void
 433 GetNodeText(nsIDOMNode* aNode, nsAutoString& aText)
 434 {
 435   nsresult rv = aNode->GetNodeValue(aText);
 436   NS_ASSERTION(NS_SUCCEEDED(rv), "Unable to get node text");
 437 }
 438
 439 // Find the previous node in the DOM tree in preorder. This isn't fast because
 440 // one call to GetPrevSibling can be O(N) in the number of siblings...
 441 static nsIDOMNode*
 442 FindPrevNode(nsIDOMNode* aNode, nsIDOMNode* aRoot)
 443 {
 444   if (aNode == aRoot)
 445     return nsnull;
 446
 447   nsCOMPtr<nsIDOMNode> prev;
 448   aNode->GetPreviousSibling(getter_AddRefs(prev));
 449   if (prev) {
 450     for (;;) {
 451       nsCOMPtr<nsIDOMNode> lastChild;
 452       prev->GetLastChild(getter_AddRefs(lastChild));
 453       if (!lastChild)
 454         return prev;
 455       prev = lastChild;
 456     }
 457   }
 458
 459   // No prev sibling. So we are the first child of our parent, if any. Our
 460   // parent is our previous node.
 461   aNode->GetParentNode(getter_AddRefs(prev));
 462   return prev;
 463 }
 464
 465 /**
 466  * Check if there's a DOM word separator before aBeforeOffset in this node.
 467  * Always returns PR_TRUE if it's a BR element.
 468  * aSeparatorOffset is set to the index of the last separator if any is found
 469  * (0 for BR elements).
 470  */
 471 static PRBool
 472 ContainsDOMWordSeparator(nsIDOMNode* aNode, PRInt32 aBeforeOffset,
 473                          PRInt32* aSeparatorOffset)
 474 {
 475   if (IsBRElement(aNode)) {
 476     *aSeparatorOffset = 0;
 477     return PR_TRUE;
 478   }
 479
 480   if (!IsTextNode(aNode))
 481     return PR_FALSE;
 482
 483   nsAutoString str;
 484   GetNodeText(aNode, str);
 485   for (PRInt32 i = PR_MIN(aBeforeOffset, PRInt32(str.Length())) - 1; i >= 0; --i) {
 486     if (IsDOMWordSeparator(str.CharAt(i))) {
 487       *aSeparatorOffset = i;
 488       return PR_TRUE;
 489     }
 490   }
 491   return PR_FALSE;
 492 }
 493
 494 static PRBool
 495 IsBreakElement(nsIDOMViewCSS* aDocView, nsIDOMNode* aNode)
 496 {
 497   nsCOMPtr<nsIDOMElement> element = do_QueryInterface(aNode);
 498   if (!element)
 499     return PR_FALSE;
 500
 501   if (IsBRElement(aNode))
 502     return PR_TRUE;
 503
 504   nsCOMPtr<nsIDOMCSSStyleDeclaration> style;
 505   aDocView->GetComputedStyle(element, EmptyString(), getter_AddRefs(style));
 506   if (!style)
 507     return PR_FALSE;
 508
 509 #ifdef DEBUG_SPELLCHECK
 510   printf("    searching element %p\n", (void*)aNode);
 511 #endif
 512
 513   nsAutoString display;
 514   style->GetPropertyValue(NS_LITERAL_STRING("display"), display);
 515 #ifdef DEBUG_SPELLCHECK
 516   printf("      display=\"%s\"\n", NS_ConvertUTF16toUTF8(display).get());
 517 #endif
 518   if (!display.EqualsLiteral("inline"))
 519     return PR_TRUE;
 520
 521   nsAutoString position;
 522   style->GetPropertyValue(NS_LITERAL_STRING("position"), position);
 523 #ifdef DEBUG_SPELLCHECK
 524   printf("      position=%s\n", NS_ConvertUTF16toUTF8(position).get());
 525 #endif
 526   if (!position.EqualsLiteral("static"))
 527     return PR_TRUE;
 528
 529   // XXX What about floats? What else?
 530   return PR_FALSE;
 531 }
 532
 533 struct CheckLeavingBreakElementClosure {
 534   nsIDOMViewCSS* mDocView;
 535   PRPackedBool   mLeftBreakElement;
 536 };
 537
 538 static void
 539 CheckLeavingBreakElement(nsIDOMNode* aNode, void* aClosure)
 540 {
 541   CheckLeavingBreakElementClosure* cl =
 542     static_cast<CheckLeavingBreakElementClosure*>(aClosure);
 543   if (!cl->mLeftBreakElement && IsBreakElement(cl->mDocView, aNode)) {
 544     cl->mLeftBreakElement = PR_TRUE;
 545   }
 546 }
 547
 548 void
 549 mozInlineSpellWordUtil::NormalizeWord(nsSubstring& aWord)
 550 {
 551   nsAutoString result;
 552   ::NormalizeWord(aWord, 0, aWord.Length(), result);
 553   aWord = result;
 554 }
 555
 556 void
 557 mozInlineSpellWordUtil::BuildSoftText()
 558 {
 559   // First we have to work backwards from mSoftStart to find a text node
 560   // containing a DOM word separator, a non-inline-element
 561   // boundary, or the hard start node. That's where we'll start building the
 562   // soft string from.
 563   nsIDOMNode* node = mSoftBegin.mNode;
 564   PRInt32 firstOffsetInNode = 0;
 565   PRInt32 checkBeforeOffset = mSoftBegin.mOffset;
 566   while (node) {
 567     if (ContainsDOMWordSeparator(node, checkBeforeOffset, &firstOffsetInNode))
 568       break;
 569     checkBeforeOffset = PR_INT32_MAX;
 570     if (IsBreakElement(mCSSView, node)) {
 571       // Since FindPrevNode follows tree *preorder*, we're about to traverse
 572       // up out of 'node'. Since node induces breaks (e.g., it's a block),
 573       // don't bother trying to look outside it, just stop now.
 574       break;
 575     }
 576     node = FindPrevNode(node, mRootNode);
 577   }
 578
 579   // Now build up the string moving forward through the DOM until we reach
 580   // the soft end and *then* see a DOM word separator, a non-inline-element
 581   // boundary, or the hard end node.
 582   mSoftText.Truncate();
 583   mSoftTextDOMMapping.Clear();
 584   PRBool seenSoftEnd = PR_FALSE;
 585   // Leave this outside the loop so large heap string allocations can be reused
 586   // across iterations
 587   nsAutoString str;
 588   while (node) {
 589     if (node == mSoftEnd.mNode) {
 590       seenSoftEnd = PR_TRUE;
 591     }
 592
 593     PRBool exit = PR_FALSE;
 594     if (IsTextNode(node)) {
 595       GetNodeText(node, str);
 596       PRInt32 lastOffsetInNode = str.Length();
 597
 598       if (seenSoftEnd) {
 599         // check whether we can stop after this
 600         for (PRInt32 i = node == mSoftEnd.mNode ? mSoftEnd.mOffset : 0;
 601              i < PRInt32(str.Length()); ++i) {
 602           if (IsDOMWordSeparator(str.CharAt(i))) {
 603             exit = PR_TRUE;
 604             // stop at the first separator after the soft end point
 605             lastOffsetInNode = i;
 606             break;
 607           }
 608         }
 609       }
 610
 611       if (firstOffsetInNode < lastOffsetInNode) {
 612         PRInt32 len = lastOffsetInNode - firstOffsetInNode;
 613         mSoftTextDOMMapping.AppendElement(
 614           DOMTextMapping(NodeOffset(node, firstOffsetInNode), mSoftText.Length(), len));
 615         mSoftText.Append(Substring(str, firstOffsetInNode, len));
 616       }
 617
 618       firstOffsetInNode = 0;
 619     }
 620
 621     if (exit)
 622       break;
 623
 624     CheckLeavingBreakElementClosure closure = { mCSSView, PR_FALSE };
 625     node = FindNextNode(node, mRootNode, CheckLeavingBreakElement, &closure);
 626     if (closure.mLeftBreakElement || (node && IsBreakElement(mCSSView, node))) {
 627       // We left, or are entering, a break element (e.g., block). Maybe we can
 628       // stop now.
 629       if (seenSoftEnd)
 630         break;
 631       // Record the break
 632       mSoftText.Append(' ');
 633     }
 634   }
 635
 636 #ifdef DEBUG_SPELLCHECK
 637   printf("Got DOM string: %s\n", NS_ConvertUTF16toUTF8(mSoftText).get());
 638 #endif
 639 }
 640
 641 void
 642 mozInlineSpellWordUtil::BuildRealWords()
 643 {
 644   // This is pretty simple. We just have to walk mSoftText, tokenizing it
 645   // into "real words".
 646   // We do an outer traversal of words delimited by IsDOMWordSeparator, calling
 647   // SplitDOMWord on each of those DOM words
 648   PRInt32 wordStart = -1;
 649   mRealWords.Clear();
 650   for (PRInt32 i = 0; i < PRInt32(mSoftText.Length()); ++i) {
 651     if (IsDOMWordSeparator(mSoftText.CharAt(i))) {
 652       if (wordStart >= 0) {
 653         SplitDOMWord(wordStart, i);
 654         wordStart = -1;
 655       }
 656     } else {
 657       if (wordStart < 0) {
 658         wordStart = i;
 659       }
 660     }
 661   }
 662   if (wordStart >= 0) {
 663     SplitDOMWord(wordStart, mSoftText.Length());
 664   }
 665 }
 666
 667 /*********** DOM/realwords<->mSoftText mapping functions ************/
 668
 669 PRInt32
 670 mozInlineSpellWordUtil::MapDOMPositionToSoftTextOffset(NodeOffset aNodeOffset)
 671 {
 672   if (!mSoftTextValid) {
 673     NS_ERROR("Soft text must be valid if we're to map into it");
 674     return -1;
 675   }
 676
 677   for (PRInt32 i = 0; i < PRInt32(mSoftTextDOMMapping.Length()); ++i) {
 678     const DOMTextMapping& map = mSoftTextDOMMapping[i];
 679     if (map.mNodeOffset.mNode == aNodeOffset.mNode) {
 680       // Allow offsets at either end of the string, in particular, allow the
 681       // offset that's at the end of the contributed string
 682       PRInt32 offsetInContributedString =
 683         aNodeOffset.mOffset - map.mNodeOffset.mOffset;
 684       if (offsetInContributedString >= 0 &&
 685           offsetInContributedString <= map.mLength)
 686         return map.mSoftTextOffset + offsetInContributedString;
 687       return -1;
 688     }
 689   }
 690   return -1;
 691 }
 692
 693 mozInlineSpellWordUtil::NodeOffset
 694 mozInlineSpellWordUtil::MapSoftTextOffsetToDOMPosition(PRInt32 aSoftTextOffset,
 695                                                        DOMMapHint aHint)
 696 {
 697   NS_ASSERTION(mSoftTextValid, "Soft text must be valid if we're to map out of it");
 698   if (!mSoftTextValid)
 699     return NodeOffset(nsnull, -1);
 700
 701   // The invariant is that the range start..end includes the last mapping,
 702   // if any, such that mSoftTextOffset <= aSoftTextOffset
 703   PRInt32 start = 0;
 704   PRInt32 end = mSoftTextDOMMapping.Length();
 705   while (end - start >= 2) {
 706     PRInt32 mid = (start + end)/2;
 707     const DOMTextMapping& map = mSoftTextDOMMapping[mid];
 708     if (map.mSoftTextOffset > aSoftTextOffset) {
 709       end = mid;
 710     } else {
 711       start = mid;
 712     }
 713   }
 714
 715   if (start >= end)
 716     return NodeOffset(nsnull, -1);
 717
 718   // 'start' is now the last mapping, if any, such that
 719   // mSoftTextOffset <= aSoftTextOffset.
 720   // If we're doing HINT_END, then we may want to return the end of the
 721   // the previous mapping instead of the start of this mapping
 722   if (aHint == HINT_END && start > 0) {
 723     const DOMTextMapping& map = mSoftTextDOMMapping[start - 1];
 724     if (map.mSoftTextOffset + map.mLength == aSoftTextOffset)
 725       return NodeOffset(map.mNodeOffset.mNode, map.mNodeOffset.mOffset + map.mLength);
 726   }
 727
 728   // We allow ourselves to return the end of this mapping even if we're
 729   // doing HINT_START. This will only happen if there is no mapping which this
 730   // point is the start of. I'm not 100% sure this is OK...
 731   const DOMTextMapping& map = mSoftTextDOMMapping[start];
 732   PRInt32 offset = aSoftTextOffset - map.mSoftTextOffset;
 733   if (offset >= 0 && offset <= map.mLength)
 734     return NodeOffset(map.mNodeOffset.mNode, map.mNodeOffset.mOffset + offset);
 735
 736   return NodeOffset(nsnull, -1);
 737 }
 738
 739 PRInt32
 740 mozInlineSpellWordUtil::FindRealWordContaining(PRInt32 aSoftTextOffset,
 741     DOMMapHint aHint, PRBool aSearchForward)
 742 {
 743   NS_ASSERTION(mSoftTextValid, "Soft text must be valid if we're to map out of it");
 744   if (!mSoftTextValid)
 745     return -1;
 746
 747   // The invariant is that the range start..end includes the last word,
 748   // if any, such that mSoftTextOffset <= aSoftTextOffset
 749   PRInt32 start = 0;
 750   PRInt32 end = mRealWords.Length();
 751   while (end - start >= 2) {
 752     PRInt32 mid = (start + end)/2;
 753     const RealWord& word = mRealWords[mid];
 754     if (word.mSoftTextOffset > aSoftTextOffset) {
 755       end = mid;
 756     } else {
 757       start = mid;
 758     }
 759   }
 760
 761   if (start >= end)
 762     return -1;
 763
 764   // 'start' is now the last word, if any, such that
 765   // mSoftTextOffset <= aSoftTextOffset.
 766   // If we're doing HINT_END, then we may want to return the end of the
 767   // the previous word instead of the start of this word
 768   if (aHint == HINT_END && start > 0) {
 769     const RealWord& word = mRealWords[start - 1];
 770     if (word.mSoftTextOffset + word.mLength == aSoftTextOffset)
 771       return start - 1;
 772   }
 773
 774   // We allow ourselves to return the end of this word even if we're
 775   // doing HINT_START. This will only happen if there is no word which this
 776   // point is the start of. I'm not 100% sure this is OK...
 777   const RealWord& word = mRealWords[start];
 778   PRInt32 offset = aSoftTextOffset - word.mSoftTextOffset;
 779   if (offset >= 0 && offset <= word.mLength)
 780     return start;
 781
 782   if (aSearchForward) {
 783     if (mRealWords[0].mSoftTextOffset > aSoftTextOffset) {
 784       // All words have mSoftTextOffset > aSoftTextOffset
 785       return 0;
 786     }
 787     // 'start' is the last word such that mSoftTextOffset <= aSoftTextOffset.
 788     // Word start+1, if it exists, will be the first with
 789     // mSoftTextOffset > aSoftTextOffset.
 790     if (start + 1 < PRInt32(mRealWords.Length()))
 791       return start + 1;
 792   }
 793
 794   return -1;
 795 }
 796
 797 /*********** Word Splitting ************/
 798
 799 // classifies a given character in the DOM word
 800 enum CharClass {
 801   CHAR_CLASS_WORD,
 802   CHAR_CLASS_SEPARATOR,
 803   CHAR_CLASS_END_OF_INPUT };
 804
 805 // Encapsulates DOM-word to real-word splitting
 806 struct WordSplitState
 807 {
 808   mozInlineSpellWordUtil*    mWordUtil;
 809   const nsDependentSubstring mDOMWordText;
 810   PRInt32                    mDOMWordOffset;
 811   CharClass                  mCurCharClass;
 812
 813   WordSplitState(mozInlineSpellWordUtil* aWordUtil,
 814                  const nsString& aString, PRInt32 aStart, PRInt32 aLen)
 815     : mWordUtil(aWordUtil), mDOMWordText(aString, aStart, aLen),
 816       mDOMWordOffset(0), mCurCharClass(CHAR_CLASS_END_OF_INPUT) {}
 817
 818   CharClass ClassifyCharacter(PRInt32 aIndex, PRBool aRecurse) const;
 819   void Advance();
 820   void AdvanceThroughSeparators();
 821   void AdvanceThroughWord();
 822
 823   // Finds special words like email addresses and URLs that may start at the
 824   // current position, and returns their length, or 0 if not found. This allows
 825   // arbitrary word breaking rules to be used for these special entities, as
 826   // long as they can not contain whitespace.
 827   PRInt32 FindSpecialWord();
 828
 829   // Similar to FindSpecialWord except that this takes a split word as
 830   // input. This checks for things that do not require special word-breaking
 831   // rules.
 832   PRBool ShouldSkipWord(PRInt32 aStart, PRInt32 aLength);
 833 };
 834
 835 // WordSplitState::ClassifyCharacter
 836
 837 CharClass
 838 WordSplitState::ClassifyCharacter(PRInt32 aIndex, PRBool aRecurse) const
 839 {
 840   NS_ASSERTION(aIndex >= 0 && aIndex <= PRInt32(mDOMWordText.Length()),
 841                "Index out of range");
 842   if (aIndex == PRInt32(mDOMWordText.Length()))
 843     return CHAR_CLASS_SEPARATOR;
 844
 845   // this will classify the character, we want to treat "ignorable" characters
 846   // such as soft hyphens as word characters.
 847   nsIUGenCategory::nsUGenCategory
 848     charCategory = mWordUtil->GetCategories()->Get(PRUint32(mDOMWordText[aIndex]));
 849   if (charCategory == nsIUGenCategory::kLetter ||
 850       IsIgnorableCharacter(mDOMWordText[aIndex]))
 851     return CHAR_CLASS_WORD;
 852
 853   // If conditional punctuation is surrounded immediately on both sides by word
 854   // characters it also counts as a word character.
 855   if (IsConditionalPunctuation(mDOMWordText[aIndex])) {
 856     if (!aRecurse) {
 857       // not allowed to look around, this punctuation counts like a separator
 858       return CHAR_CLASS_SEPARATOR;
 859     }
 860
 861     // check the left-hand character
 862     if (aIndex == 0)
 863       return CHAR_CLASS_SEPARATOR;
 864     if (ClassifyCharacter(aIndex - 1, false) != CHAR_CLASS_WORD)
 865       return CHAR_CLASS_SEPARATOR;
 866
 867     // now we know left char is a word-char, check the right-hand character
 868     if (aIndex == PRInt32(mDOMWordText.Length()) - 1)
 869       return CHAR_CLASS_SEPARATOR;
 870     if (ClassifyCharacter(aIndex + 1, false) != CHAR_CLASS_WORD)
 871       return CHAR_CLASS_SEPARATOR;
 872
 873     // char on either side is a word, this counts as a word
 874     return CHAR_CLASS_WORD;
 875   }
 876
 877   // all other punctuation
 878   if (charCategory == nsIUGenCategory::kSeparator ||
 879       charCategory == nsIUGenCategory::kOther ||
 880       charCategory == nsIUGenCategory::kPunctuation ||
 881       charCategory == nsIUGenCategory::kSymbol)
 882     return CHAR_CLASS_SEPARATOR;
 883
 884   // any other character counts as a word
 885   return CHAR_CLASS_WORD;
 886 }
 887
 888
 889 // WordSplitState::Advance
 890
 891 void
 892 WordSplitState::Advance()
 893 {
 894   NS_ASSERTION(mDOMWordOffset >= 0, "Negative word index");
 895   NS_ASSERTION(mDOMWordOffset < (PRInt32)mDOMWordText.Length(),
 896                "Length beyond end");
 897
 898   mDOMWordOffset ++;
 899   if (mDOMWordOffset >= (PRInt32)mDOMWordText.Length())
 900     mCurCharClass = CHAR_CLASS_END_OF_INPUT;
 901   else
 902     mCurCharClass = ClassifyCharacter(mDOMWordOffset, PR_TRUE);
 903 }
 904
 905
 906 // WordSplitState::AdvanceThroughSeparators
 907
 908 void
 909 WordSplitState::AdvanceThroughSeparators()
 910 {
 911   while (mCurCharClass == CHAR_CLASS_SEPARATOR)
 912     Advance();
 913 }
 914
 915 // WordSplitState::AdvanceThroughWord
 916
 917 void
 918 WordSplitState::AdvanceThroughWord()
 919 {
 920   while (mCurCharClass == CHAR_CLASS_WORD)
 921     Advance();
 922 }
 923
 924
 925 // WordSplitState::FindSpecialWord
 926
 927 PRInt32
 928 WordSplitState::FindSpecialWord()
 929 {
 930   PRInt32 i;
 931
 932   // Search for email addresses. We simply define these as any sequence of
 933   // characters with an '@' character in the middle. The DOM word is already
 934   // split on whitepace, so we know that everything to the end is the address
 935   //
 936   // Also look for periods, this tells us if we want to run the URL finder.
 937   PRBool foundDot = PR_FALSE;
 938   PRInt32 firstColon = -1;
 939   for (i = mDOMWordOffset;
 940        i < PRInt32(mDOMWordText.Length()); i ++) {
 941     if (mDOMWordText[i] == '@') {
 942       // only accept this if there are unambiguous word characters (don't bother
 943       // recursing to disambiguate apostrophes) on each side. This prevents
 944       // classifying, e.g. "@home" as an email address
 945
 946       // Use this condition to only accept words with '@' in the middle of
 947       // them. It works, but the inlinespellcker doesn't like this. The problem
 948       // is that you type "fhsgfh@" that's a misspelled word followed by a
 949       // symbol, but when you type another letter "fhsgfh@g" that first word
 950       // need to be unmarked misspelled. It doesn't do this. it only checks the
 951       // current position for potentially removing a spelling range.
 952       if (i > 0 && ClassifyCharacter(i - 1, PR_FALSE) == CHAR_CLASS_WORD &&
 953           i < (PRInt32)mDOMWordText.Length() - 1 &&
 954           ClassifyCharacter(i + 1, PR_FALSE) == CHAR_CLASS_WORD)
 955
 956       return mDOMWordText.Length() - mDOMWordOffset;
 957     } else if (mDOMWordText[i] == '.' && ! foundDot &&
 958         i > 0 && i < (PRInt32)mDOMWordText.Length() - 1) {
 959       // we found a period not at the end, we should check harder for URLs
 960       foundDot = PR_TRUE;
 961     } else if (mDOMWordText[i] == ':' && firstColon < 0) {
 962       firstColon = i;
 963     }
 964   }
 965
 966   // If the first colon is followed by a slash, consider it a URL
 967   // This will catch things like asdf://foo.com
 968   if (firstColon >= 0 && firstColon < (PRInt32)mDOMWordText.Length() - 1 &&
 969       mDOMWordText[firstColon + 1] == '/') {
 970     return mDOMWordText.Length() - mDOMWordOffset;
 971   }
 972
 973   // Check the text before the first colon against some known protocols. It
 974   // is impossible to check against all protocols, especially since you can
 975   // plug in new protocols. We also don't want to waste time here checking
 976   // against a lot of obscure protocols.
 977   if (firstColon > mDOMWordOffset) {
 978     nsString protocol(Substring(mDOMWordText, mDOMWordOffset,
 979                       firstColon - mDOMWordOffset));
 980     if (protocol.EqualsIgnoreCase("http") ||
 981         protocol.EqualsIgnoreCase("https") ||
 982         protocol.EqualsIgnoreCase("news") ||
 983         protocol.EqualsIgnoreCase("ftp") ||
 984         protocol.EqualsIgnoreCase("file") ||
 985         protocol.EqualsIgnoreCase("javascript") ||
 986         protocol.EqualsIgnoreCase("ftp")) {
 987       return mDOMWordText.Length() - mDOMWordOffset;
 988     }
 989   }
 990
 991   // not anything special
 992   return -1;
 993 }
 994
 995 // WordSplitState::ShouldSkipWord
 996
 997 PRBool
 998 WordSplitState::ShouldSkipWord(PRInt32 aStart, PRInt32 aLength)
 999 {
1000   PRInt32 last = aStart + aLength;
1001
1002   // check to see if the word contains a digit
1003   for (PRInt32 i = aStart; i < last; i ++) {
1004     PRUnichar ch = mDOMWordText[i];
1005     // XXX Shouldn't this be something a lot more complex, Unicode-based?
1006     if (ch >= '0' && ch <= '9')
1007       return PR_TRUE;
1008   }
1009
1010   // not special
1011   return PR_FALSE;
1012 }
1013
1014 // mozInlineSpellWordUtil::SplitDOMWord
1015
1016 void
1017 mozInlineSpellWordUtil::SplitDOMWord(PRInt32 aStart, PRInt32 aEnd)
1018 {
1019   WordSplitState state(this, mSoftText, aStart, aEnd - aStart);
1020   state.mCurCharClass = state.ClassifyCharacter(0, PR_TRUE);
1021
1022   while (state.mCurCharClass != CHAR_CLASS_END_OF_INPUT) {
1023     state.AdvanceThroughSeparators();
1024     if (state.mCurCharClass == CHAR_CLASS_END_OF_INPUT)
1025       break;
1026
1027     PRInt32 specialWordLength = state.FindSpecialWord();
1028     if (specialWordLength > 0) {
1029       mRealWords.AppendElement(
1030         RealWord(aStart + state.mDOMWordOffset, specialWordLength, PR_FALSE));
1031
1032       // skip the special word
1033       state.mDOMWordOffset += specialWordLength;
1034       if (state.mDOMWordOffset + aStart >= aEnd)
1035         state.mCurCharClass = CHAR_CLASS_END_OF_INPUT;
1036       else
1037         state.mCurCharClass = state.ClassifyCharacter(state.mDOMWordOffset, PR_TRUE);
1038       continue;
1039     }
1040
1041     // save the beginning of the word
1042     PRInt32 wordOffset = state.mDOMWordOffset;
1043
1044     // find the end of the word
1045     state.AdvanceThroughWord();
1046     PRInt32 wordLen = state.mDOMWordOffset - wordOffset;
1047     mRealWords.AppendElement(
1048       RealWord(aStart + wordOffset, wordLen,
1049                !state.ShouldSkipWord(wordOffset, wordLen)));
1050   }
1051 }