extensions/spellcheck/src/mozInlineSpellWordUtil.h

   1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
   2 /* ***** BEGIN LICENSE BLOCK *****
   3  * Version: MPL 1.1/GPL 2.0/LGPL 2.1
   4  *
   5  * The contents of this file are subject to the Mozilla Public License Version
   6  * 1.1 (the "License"); you may not use this file except in compliance with
   7  * the License. You may obtain a copy of the License at
   8  * http://www.mozilla.org/MPL/
   9  *
  10  * Software distributed under the License is distributed on an "AS IS" basis,
  11  * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
  12  * for the specific language governing rights and limitations under the
  13  * License.
  14  *
  15  * The Original Code is inline spellchecker code.
  16  *
  17  * The Initial Developer of the Original Code is Google Inc.
  18  * Portions created by the Initial Developer are Copyright (C) 2004-2006
  19  * the Initial Developer. All Rights Reserved.
  20  *
  21  * Contributor(s):
  22  *   Brett Wilson <brettw@gmail.com> (original author)
  23  *
  24  * Alternatively, the contents of this file may be used under the terms of
  25  * either the GNU General Public License Version 2 or later (the "GPL"), or
  26  * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
  27  * in which case the provisions of the GPL or the LGPL are applicable instead
  28  * of those above. If you wish to allow use of your version of this file only
  29  * under the terms of either the GPL or the LGPL, and not to allow others to
  30  * use your version of this file under the terms of the MPL, indicate your
  31  * decision by deleting the provisions above and replace them with the notice
  32  * and other provisions required by the GPL or the LGPL. If you do not delete
  33  * the provisions above, a recipient may use your version of this file under
  34  * the terms of any one of the MPL, the GPL or the LGPL.
  35  *
  36  * ***** END LICENSE BLOCK ***** */
  37
  38 #include "nsCOMPtr.h"
  39 #include "nsIDOMDocument.h"
  40 #include "nsIDOMDocumentRange.h"
  41 #include "nsIDOMViewCSS.h"
  42 #include "nsIDocument.h"
  43 #include "nsString.h"
  44 #include "nsTArray.h"
  45 #include "nsIUGenCategory.h"
  46
  47 //#define DEBUG_SPELLCHECK
  48
  49 class nsIDOMRange;  // forward declaration: only used via nsIDOMRange** in this header
  50 class nsIDOMNode;   // forward declaration: only used via nsIDOMNode* in this header
  51
  52 /**
  53  *    This class extracts text from the DOM and builds it into a single string.
  54  *    The string includes whitespace breaks wherever non-inline elements begin
  55  *    and end. This string is broken into "real words", following somewhat
  56  *    complex rules; for example substrings that look like URLs or
  57  *    email addresses are treated as single words, but otherwise many kinds of
  58  *    punctuation are treated as word separators. GetNextWord provides a way
  59  *    to iterate over these "real words".
  60  *
  61  *    The basic operation is:
  62  *
  63  *    1. Call Init with the weak pointer to the editor that you're using.
  64  *    2. Call SetEnd to set where you want to stop spellchecking. We'll stop
  65  *       at the word boundary after that. If SetEnd is not called, we'll stop
  66  *       at the end of the document's root element.
  67  *    3. Call SetPosition to initialize the current position inside the
  68  *       previously given range.
  69  *    4. Call GetNextWord over and over until it returns false.
  70  */
  71
  72 class mozInlineSpellWordUtil
  73 {
  74 public:
  75   struct NodeOffset {  // a (DOM node, offset) pair; mNode is a raw pointer, not an nsCOMPtr
  76     nsIDOMNode* mNode;
  77     PRInt32     mOffset;
  78
  79     NodeOffset(nsIDOMNode* aNode, PRInt32 aOffset) :
  80       mNode(aNode), mOffset(aOffset) {}
  81   };
  82   // Default state: null root node, null soft begin/end, no next word (-1), words invalid.
  83   mozInlineSpellWordUtil()
  84     : mRootNode(nsnull),
  85       mSoftBegin(nsnull, 0), mSoftEnd(nsnull, 0),
  86       mNextWordIndex(-1), mSoftTextValid(PR_FALSE) {}
  87   // Call first, with the weak pointer to the editor; sets the cached members below.
  88   nsresult Init(nsWeakPtr aWeakEditor);
  89   // Sets where to stop spellchecking; we stop at the word boundary after this point.
  90   nsresult SetEnd(nsIDOMNode* aEndNode, PRInt32 aEndOffset);
  91
  92   // Sets the current position, which should be inside the range. If we are in
  93   // the middle of a word, we'll move to its start.
  94   nsresult SetPosition(nsIDOMNode* aNode, PRInt32 aOffset);
  95
  96   // Given a point inside or immediately following a word, this returns the
  97   // DOM range that exactly encloses that word's characters. The current
  98   // position will be at the end of the word. This will find the previous
  99   // word if the current position is space, so if you care that the point is
 100   // inside the word, you should check the range.
 101   //
 102   // THIS CHANGES THE CURRENT POSITION AND RANGE. It is designed to be called
 103   // before you actually generate the range you are interested in and iterate
 104   // the words in it.
 105   nsresult GetRangeForWord(nsIDOMNode* aWordNode, PRInt32 aWordOffset,
 106                            nsIDOMRange** aRange);
 107
 108   // Moves to the next word in the range, and retrieves its text and range.
 109   // An empty word and a NULL range are returned when we are done checking.
 110   // aSkipChecking will be set if the word is "special" and shouldn't be
 111   // checked (e.g., an email address).
 112   nsresult GetNextWord(nsAString& aText, nsIDOMRange** aRange,
 113                        PRBool* aSkipChecking);
 114
 115   // Call to normalize some punctuation. This function takes a non-const
 116   // nsSubstring (not an autostring) so we can access characters directly.
 117   static void NormalizeWord(nsSubstring& aWord);
 118   // Simple accessors for the cached members; each returns the stored pointer as-is.
 119   nsIDOMDocumentRange* GetDocumentRange() const { return mDOMDocumentRange; }
 120   nsIDocument* GetDocument() const { return mDocument; }
 121   nsIDOMNode* GetRootNode() { return mRootNode; }
 122   nsIUGenCategory* GetCategories() { return mCategories; }
 123
 124 private:
 125
 126   // cached stuff for the editor, set by Init
 127   nsCOMPtr<nsIDOMDocumentRange> mDOMDocumentRange;
 128   nsCOMPtr<nsIDocument>         mDocument;
 129   nsCOMPtr<nsIDOMViewCSS>       mCSSView;
 130   nsCOMPtr<nsIUGenCategory>     mCategories;
 131   // Range to check. NOTE(review): no SetRange is declared in this class;
 132   // presumably these are established by SetPosition/SetEnd -- confirm in the .cpp.
 133   nsIDOMNode* mRootNode;
 134   NodeOffset  mSoftBegin;
 135   NodeOffset  mSoftEnd;
 136
 137   // DOM text covering the soft range, with newlines added at block boundaries
 138   nsString mSoftText;
 139   // A list of where we extracted text from, ordered by mSoftTextOffset. A given
 140   // DOM node appears at most once in this list.
 141   struct DOMTextMapping {
 142     NodeOffset mNodeOffset;
 143     PRInt32    mSoftTextOffset;
 144     PRInt32    mLength;
 145
 146     DOMTextMapping(NodeOffset aNodeOffset, PRInt32 aSoftTextOffset, PRInt32 aLength)
 147       : mNodeOffset(aNodeOffset), mSoftTextOffset(aSoftTextOffset),
 148         mLength(aLength) {}
 149   };
 150   nsTArray<DOMTextMapping> mSoftTextDOMMapping;
 151
 152   // A list of the "real words" in mSoftText, ordered by mSoftTextOffset
 153   struct RealWord {
 154     PRInt32      mSoftTextOffset;
 155     PRInt32      mLength;
 156     PRPackedBool mCheckableWord;
 157
 158     RealWord(PRInt32 aOffset, PRInt32 aLength, PRPackedBool aCheckable)
 159       : mSoftTextOffset(aOffset), mLength(aLength), mCheckableWord(aCheckable) {}
 160     PRInt32 EndOffset() const { return mSoftTextOffset + mLength; }  // one past the word's last character
 161   };
 162   nsTArray<RealWord> mRealWords;
 163   PRInt32            mNextWordIndex;  // starts at -1 (see constructor)
 164   // Starts PR_FALSE and is reset to PR_FALSE by InvalidateWords(); presumably consulted by EnsureWords().
 165   PRPackedBool mSoftTextValid;
 166
 167   void InvalidateWords() { mSoftTextValid = PR_FALSE; }
 168   void EnsureWords();
 169   // Map a DOM position to an offset into mSoftText (failure value not visible in this header).
 170   PRInt32 MapDOMPositionToSoftTextOffset(NodeOffset aNodeOffset);
 171   // Map an offset into mSoftText to a DOM position. Note that two DOM positions
 172   // can map to the same mSoftText offset, e.g. given nodes A=aaaa and B=bbbb
 173   // forming aaaabbbb, (A,4) and (B,0) give the same string offset. So,
 174   // aHint controls which position we return ... if aHint is HINT_END
 175   // then the position indicates the END of a range so we return (A,4). Otherwise
 176   // the position indicates the START of a range so we return (B,0).
 177   enum DOMMapHint { HINT_BEGIN, HINT_END };
 178   NodeOffset MapSoftTextOffsetToDOMPosition(PRInt32 aSoftTextOffset,
 179                                             DOMMapHint aHint);
 180   // Finds the index of the real word containing aSoftTextOffset, or -1 if none.
 181   // If it's exactly between two words, then if aHint is HINT_BEGIN, return the
 182   // later word (favouring the assumption that it's the BEGINning of a word),
 183   // otherwise return the earlier word (assuming it's the END of a word).
 184   // If aSearchForward is true, then if we don't find a word at the given
 185   // position, search forward until we do find a word and return that (if found).
 186   PRInt32 FindRealWordContaining(PRInt32 aSoftTextOffset, DOMMapHint aHint,
 187                                  PRBool aSearchForward);
 188
 189   // build mSoftText and mSoftTextDOMMapping
 190   void BuildSoftText();
 191   // Build mRealWords array
 192   void BuildRealWords();
 193   // NOTE(review): presumably splits mSoftText between aStart and aEnd into real words -- confirm in the .cpp.
 194   void SplitDOMWord(PRInt32 aStart, PRInt32 aEnd);
 195
 196   // Convenience functions, object must be initialized
 197   nsresult MakeRange(NodeOffset aBegin, NodeOffset aEnd, nsIDOMRange** aRange);
 198   nsresult MakeRangeForWord(const RealWord& aWord, nsIDOMRange** aRange);
 199 };