1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* ***** BEGIN LICENSE BLOCK *****
3 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
5 * The contents of this file are subject to the Mozilla Public License Version
6 * 1.1 (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 * http://www.mozilla.org/MPL/
10 * Software distributed under the License is distributed on an "AS IS" basis,
11 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
12 * for the specific language governing rights and limitations under the
15 * The Original Code is inline spellchecker code.
17 * The Initial Developer of the Original Code is Google Inc.
18 * Portions created by the Initial Developer are Copyright (C) 2004-2006
19 * the Initial Developer. All Rights Reserved.
22 * Brett Wilson <brettw@gmail.com> (original author)
23 * Robert O'Callahan <rocallahan@novell.com>
25 * Alternatively, the contents of this file may be used under the terms of
26 * either the GNU General Public License Version 2 or later (the "GPL"), or
27 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
28 * in which case the provisions of the GPL or the LGPL are applicable instead
29 * of those above. If you wish to allow use of your version of this file only
30 * under the terms of either the GPL or the LGPL, and not to allow others to
31 * use your version of this file under the terms of the MPL, indicate your
32 * decision by deleting the provisions above and replace them with the notice
33 * and other provisions required by the GPL or the LGPL. If you do not delete
34 * the provisions above, a recipient may use your version of this file under
35 * the terms of any one of the MPL, the GPL or the LGPL.
37 * ***** END LICENSE BLOCK ***** */
39 #include "mozInlineSpellWordUtil.h"
42 #include "nsComponentManagerUtils.h"
43 #include "nsIDOMCSSStyleDeclaration.h"
44 #include "nsIDOMDocumentView.h"
45 #include "nsIDOMElement.h"
46 #include "nsIDOMNSRange.h"
47 #include "nsIDOMRange.h"
48 #include "nsIEditor.h"
49 #include "nsIDOMNode.h"
50 #include "nsIDOMHTMLBRElement.h"
51 #include "nsUnicharUtilCIID.h"
52 #include "nsServiceManagerUtils.h"
54 // IsIgnorableCharacter
56 // These characters are ones that we should ignore in input.
58 inline PRBool
IsIgnorableCharacter(PRUnichar ch
)
60 return (ch
== 0x200D || // ZERO-WIDTH JOINER
61 ch
== 0xAD || // SOFT HYPHEN
62 ch
== 0x1806); // MONGOLIAN TODO SOFT HYPHEN
65 // IsConditionalPunctuation
67 // Some characters (like apostrophes) require characters on each side to be
68 // part of a word, and are otherwise punctuation.
70 inline PRBool
IsConditionalPunctuation(PRUnichar ch
)
73 ch
== 0x2019); // RIGHT SINGLE QUOTATION MARK
76 // mozInlineSpellWordUtil::Init
79 mozInlineSpellWordUtil::Init(nsWeakPtr aWeakEditor
)
83 mCategories
= do_GetService(NS_UNICHARCATEGORY_CONTRACTID
, &rv
);
87 // getting the editor can fail commonly because the editor was detached, so
89 nsCOMPtr
<nsIEditor
> editor
= do_QueryReferent(aWeakEditor
, &rv
);
93 nsCOMPtr
<nsIDOMDocument
> domDoc
;
94 rv
= editor
->GetDocument(getter_AddRefs(domDoc
));
95 NS_ENSURE_SUCCESS(rv
, rv
);
97 mDocument
= do_QueryInterface(domDoc
, &rv
);
98 NS_ENSURE_SUCCESS(rv
, rv
);
100 mDOMDocumentRange
= do_QueryInterface(domDoc
, &rv
);
101 NS_ENSURE_SUCCESS(rv
, rv
);
104 nsCOMPtr
<nsIDOMDocumentView
> docView
= do_QueryInterface(domDoc
, &rv
);
105 NS_ENSURE_SUCCESS(rv
, rv
);
106 nsCOMPtr
<nsIDOMAbstractView
> abstractView
;
107 rv
= docView
->GetDefaultView(getter_AddRefs(abstractView
));
108 NS_ENSURE_SUCCESS(rv
, rv
);
109 mCSSView
= do_QueryInterface(abstractView
, &rv
);
110 NS_ENSURE_SUCCESS(rv
, rv
);
112 // Find the root node for the editor. For contenteditable we'll need something
114 nsCOMPtr
<nsIDOMElement
> rootElt
;
115 rv
= editor
->GetRootElement(getter_AddRefs(rootElt
));
116 NS_ENSURE_SUCCESS(rv
, rv
);
119 NS_ASSERTION(mRootNode
, "GetRootElement returned null *and* claimed to suceed!");
124 IsTextNode(nsIDOMNode
* aNode
)
127 aNode
->GetNodeType(&type
);
128 return type
== nsIDOMNode::TEXT_NODE
;
131 typedef void (* OnLeaveNodeFunPtr
)(nsIDOMNode
* aNode
, void* aClosure
);
133 // Find the next node in the DOM tree in preorder. This isn't fast because
134 // one call to GetNextSibling can be O(N) in the number of siblings...
135 // Calls OnLeaveNodeFunPtr when the traversal leaves a node
137 FindNextNode(nsIDOMNode
* aNode
, nsIDOMNode
* aRoot
,
138 OnLeaveNodeFunPtr aOnLeaveNode
= nsnull
, void* aClosure
= nsnull
)
140 NS_PRECONDITION(aNode
, "Null starting node?");
142 nsCOMPtr
<nsIDOMNode
> next
;
143 aNode
->GetFirstChild(getter_AddRefs(next
));
147 // Don't look at siblings or otherwise outside of aRoot
151 aNode
->GetNextSibling(getter_AddRefs(next
));
158 aOnLeaveNode(aNode
, aClosure
);
161 aNode
->GetParentNode(getter_AddRefs(next
));
162 if (next
== aRoot
|| ! next
)
166 aNode
->GetNextSibling(getter_AddRefs(next
));
172 // aNode is not a text node. Find the first text node starting at aNode/aOffset
173 // in a preorder DOM traversal.
175 FindNextTextNode(nsIDOMNode
* aNode
, PRInt32 aOffset
, nsIDOMNode
* aRoot
)
177 NS_PRECONDITION(aNode
, "Null starting node?");
178 NS_ASSERTION(!IsTextNode(aNode
), "FindNextTextNode should start with a non-text node");
180 nsIDOMNode
* checkNode
;
181 // Need to start at the aOffset'th child
182 nsCOMPtr
<nsIDOMNode
> child
;
183 aNode
->GetFirstChild(getter_AddRefs(child
));
184 while (child
&& aOffset
> 0) {
185 nsCOMPtr
<nsIDOMNode
> next
;
186 child
->GetNextSibling(getter_AddRefs(next
));
193 // aOffset was beyond the end of the child list.
194 // goto next node in a preorder DOM traversal.
195 nsCOMPtr
<nsIDOMNode
> next
;
196 aNode
->GetNextSibling(getter_AddRefs(next
));
199 aNode
->GetParentNode(getter_AddRefs(next
));
200 if (next
== aRoot
|| !next
) {
204 aNode
->GetNextSibling(getter_AddRefs(next
));
209 while (checkNode
&& !IsTextNode(checkNode
)) {
210 checkNode
= FindNextNode(checkNode
, aRoot
);
215 // mozInlineSpellWordUtil::SetEnd
217 // We have two ranges "hard" and "soft". The hard boundary is simply
218 // the scope of the root node. The soft boundary is that which is set
219 // by the caller of this class by calling this function. If this function is
220 // not called, the soft boundary is the same as the hard boundary.
222 // When we reach the soft boundary (mSoftEnd), we keep
223 // going until we reach the end of a word. This allows the caller to set the
224 // end of the range to anything, and we will always check whole multiples of
225 // words. When we reach the hard boundary we stop no matter what.
227 // There is no beginning soft boundary. This is because we only go to the
228 // previous node once, when finding the previous word boundary in
229 // SetPosition(). You might think of the soft boundary as being this initial
233 mozInlineSpellWordUtil::SetEnd(nsIDOMNode
* aEndNode
, PRInt32 aEndOffset
)
235 NS_PRECONDITION(aEndNode
, "Null end node?");
237 NS_ASSERTION(mRootNode
, "Not initialized");
241 if (!IsTextNode(aEndNode
)) {
242 // End at the start of the first text node after aEndNode/aEndOffset.
243 aEndNode
= FindNextTextNode(aEndNode
, aEndOffset
, mRootNode
);
246 mSoftEnd
= NodeOffset(aEndNode
, aEndOffset
);
251 mozInlineSpellWordUtil::SetPosition(nsIDOMNode
* aNode
, PRInt32 aOffset
)
255 if (!IsTextNode(aNode
)) {
256 // Start at the start of the first text node after aNode/aOffset.
257 aNode
= FindNextTextNode(aNode
, aOffset
, mRootNode
);
260 mSoftBegin
= NodeOffset(aNode
, aOffset
);
264 PRInt32 textOffset
= MapDOMPositionToSoftTextOffset(mSoftBegin
);
267 mNextWordIndex
= FindRealWordContaining(textOffset
, HINT_END
, PR_TRUE
);
272 mozInlineSpellWordUtil::EnsureWords()
278 mSoftTextValid
= PR_TRUE
;
282 mozInlineSpellWordUtil::MakeRangeForWord(const RealWord
& aWord
, nsIDOMRange
** aRange
)
284 NodeOffset begin
= MapSoftTextOffsetToDOMPosition(aWord
.mSoftTextOffset
, HINT_BEGIN
);
285 NodeOffset end
= MapSoftTextOffsetToDOMPosition(aWord
.EndOffset(), HINT_END
);
286 return MakeRange(begin
, end
, aRange
);
289 // mozInlineSpellWordUtil::GetRangeForWord
292 mozInlineSpellWordUtil::GetRangeForWord(nsIDOMNode
* aWordNode
,
294 nsIDOMRange
** aRange
)
296 // Set our soft end and start
297 NodeOffset pt
= NodeOffset(aWordNode
, aWordOffset
);
300 mSoftBegin
= mSoftEnd
= pt
;
303 PRInt32 offset
= MapDOMPositionToSoftTextOffset(pt
);
305 return MakeRange(pt
, pt
, aRange
);
306 PRInt32 wordIndex
= FindRealWordContaining(offset
, HINT_BEGIN
, PR_FALSE
);
308 return MakeRange(pt
, pt
, aRange
);
309 return MakeRangeForWord(mRealWords
[wordIndex
], aRange
);
312 // This is to fix characters that the spellchecker may not like
314 NormalizeWord(const nsSubstring
& aInput
, PRInt32 aPos
, PRInt32 aLen
, nsAString
& aOutput
)
317 for (PRInt32 i
= 0; i
< aLen
; i
++) {
318 PRUnichar ch
= aInput
.CharAt(i
+ aPos
);
320 // remove ignorable characters from the word
321 if (IsIgnorableCharacter(ch
))
324 // the spellchecker doesn't handle curly apostrophes in all languages
325 if (ch
== 0x2019) { // RIGHT SINGLE QUOTATION MARK
333 // mozInlineSpellWordUtil::GetNextWord
335 // FIXME-optimization: we shouldn't have to generate a range every single
336 // time. It would be better if the inline spellchecker didn't require a
337 // range unless the word was misspelled. This may or may not be possible.
340 mozInlineSpellWordUtil::GetNextWord(nsAString
& aText
, nsIDOMRange
** aRange
,
341 PRBool
* aSkipChecking
)
343 #ifdef DEBUG_SPELLCHECK
344 printf("GetNextWord called; mNextWordIndex=%d\n", mNextWordIndex
);
347 if (mNextWordIndex
< 0 ||
348 mNextWordIndex
>= PRInt32(mRealWords
.Length())) {
351 *aSkipChecking
= PR_TRUE
;
355 const RealWord
& word
= mRealWords
[mNextWordIndex
];
356 nsresult rv
= MakeRangeForWord(word
, aRange
);
357 NS_ENSURE_SUCCESS(rv
, rv
);
359 *aSkipChecking
= !word
.mCheckableWord
;
360 ::NormalizeWord(mSoftText
, word
.mSoftTextOffset
, word
.mLength
, aText
);
362 #ifdef DEBUG_SPELLCHECK
363 printf("GetNextWord returning: %s (skip=%d)\n",
364 NS_ConvertUTF16toUTF8(aText
).get(), *aSkipChecking
);
370 // mozInlineSpellWordUtil::MakeRange
372 // Convenience function for creating a range over the current document.
375 mozInlineSpellWordUtil::MakeRange(NodeOffset aBegin
, NodeOffset aEnd
,
376 nsIDOMRange
** aRange
)
378 if (! mDOMDocumentRange
)
379 return NS_ERROR_NOT_INITIALIZED
;
381 nsresult rv
= mDOMDocumentRange
->CreateRange(aRange
);
382 NS_ENSURE_SUCCESS(rv
, rv
);
384 rv
= (*aRange
)->SetStart(aBegin
.mNode
, aBegin
.mOffset
);
385 NS_ENSURE_SUCCESS(rv
, rv
);
386 rv
= (*aRange
)->SetEnd(aEnd
.mNode
, aEnd
.mOffset
);
387 NS_ENSURE_SUCCESS(rv
, rv
);
392 /*********** DOM text extraction ************/
394 // IsDOMWordSeparator
396 // Determines if the given character should be considered as a DOM Word
397 // separator. Basically, this is whitespace, although it could also have
398 // certain punctuation that we know ALWAYS breaks words. This is important.
399 // For example, we can't have any punctuation that could appear in a URL
400 // or email address in this, because those need to always fit into a single
404 IsDOMWordSeparator(PRUnichar ch
)
407 if (ch
== ' ' || ch
== '\t' || ch
== '\n' || ch
== '\r')
410 // complex spaces - check only if char isn't ASCII (uncommon)
412 (ch
== 0x00A0 || // NO-BREAK SPACE
413 ch
== 0x2002 || // EN SPACE
414 ch
== 0x2003 || // EM SPACE
415 ch
== 0x2009 || // THIN SPACE
416 ch
== 0x200C || // ZERO WIDTH NON-JOINER
417 ch
== 0x3000)) // IDEOGRAPHIC SPACE
420 // otherwise not a space
425 IsBRElement(nsIDOMNode
* aNode
)
428 nsCOMPtr
<nsIDOMHTMLBRElement
> elt
= do_QueryInterface(aNode
, &rv
);
429 return NS_SUCCEEDED(rv
);
433 GetNodeText(nsIDOMNode
* aNode
, nsAutoString
& aText
)
435 nsresult rv
= aNode
->GetNodeValue(aText
);
436 NS_ASSERTION(NS_SUCCEEDED(rv
), "Unable to get node text");
439 // Find the previous node in the DOM tree in preorder. This isn't fast because
440 // one call to GetPrevSibling can be O(N) in the number of siblings...
442 FindPrevNode(nsIDOMNode
* aNode
, nsIDOMNode
* aRoot
)
447 nsCOMPtr
<nsIDOMNode
> prev
;
448 aNode
->GetPreviousSibling(getter_AddRefs(prev
));
451 nsCOMPtr
<nsIDOMNode
> lastChild
;
452 prev
->GetLastChild(getter_AddRefs(lastChild
));
459 // No prev sibling. So we are the first child of our parent, if any. Our
460 // parent is our previous node.
461 aNode
->GetParentNode(getter_AddRefs(prev
));
466 * Check if there's a DOM word separator before aBeforeOffset in this node.
467 * Always returns PR_TRUE if it's a BR element.
468 * aSeparatorOffset is set to the index of the last separator if any is found
469 * (0 for BR elements).
472 ContainsDOMWordSeparator(nsIDOMNode
* aNode
, PRInt32 aBeforeOffset
,
473 PRInt32
* aSeparatorOffset
)
475 if (IsBRElement(aNode
)) {
476 *aSeparatorOffset
= 0;
480 if (!IsTextNode(aNode
))
484 GetNodeText(aNode
, str
);
485 for (PRInt32 i
= PR_MIN(aBeforeOffset
, PRInt32(str
.Length())) - 1; i
>= 0; --i
) {
486 if (IsDOMWordSeparator(str
.CharAt(i
))) {
487 *aSeparatorOffset
= i
;
495 IsBreakElement(nsIDOMViewCSS
* aDocView
, nsIDOMNode
* aNode
)
497 nsCOMPtr
<nsIDOMElement
> element
= do_QueryInterface(aNode
);
501 if (IsBRElement(aNode
))
504 nsCOMPtr
<nsIDOMCSSStyleDeclaration
> style
;
505 aDocView
->GetComputedStyle(element
, EmptyString(), getter_AddRefs(style
));
509 #ifdef DEBUG_SPELLCHECK
510 printf(" searching element %p\n", (void*)aNode
);
513 nsAutoString display
;
514 style
->GetPropertyValue(NS_LITERAL_STRING("display"), display
);
515 #ifdef DEBUG_SPELLCHECK
516 printf(" display=\"%s\"\n", NS_ConvertUTF16toUTF8(display
).get());
518 if (!display
.EqualsLiteral("inline"))
521 nsAutoString position
;
522 style
->GetPropertyValue(NS_LITERAL_STRING("position"), position
);
523 #ifdef DEBUG_SPELLCHECK
524 printf(" position=%s\n", NS_ConvertUTF16toUTF8(position
).get());
526 if (!position
.EqualsLiteral("static"))
529 // XXX What about floats? What else?
533 struct CheckLeavingBreakElementClosure
{
534 nsIDOMViewCSS
* mDocView
;
535 PRPackedBool mLeftBreakElement
;
539 CheckLeavingBreakElement(nsIDOMNode
* aNode
, void* aClosure
)
541 CheckLeavingBreakElementClosure
* cl
=
542 static_cast<CheckLeavingBreakElementClosure
*>(aClosure
);
543 if (!cl
->mLeftBreakElement
&& IsBreakElement(cl
->mDocView
, aNode
)) {
544 cl
->mLeftBreakElement
= PR_TRUE
;
549 mozInlineSpellWordUtil::NormalizeWord(nsSubstring
& aWord
)
552 ::NormalizeWord(aWord
, 0, aWord
.Length(), result
);
557 mozInlineSpellWordUtil::BuildSoftText()
559 // First we have to work backwards from mSoftBegin to find a text node
560 // containing a DOM word separator, a non-inline-element
561 // boundary, or the hard start node. That's where we'll start building the
563 nsIDOMNode
* node
= mSoftBegin
.mNode
;
564 PRInt32 firstOffsetInNode
= 0;
565 PRInt32 checkBeforeOffset
= mSoftBegin
.mOffset
;
567 if (ContainsDOMWordSeparator(node
, checkBeforeOffset
, &firstOffsetInNode
))
569 checkBeforeOffset
= PR_INT32_MAX
;
570 if (IsBreakElement(mCSSView
, node
)) {
571 // Since FindPrevNode follows tree *preorder*, we're about to traverse
572 // up out of 'node'. Since node induces breaks (e.g., it's a block),
573 // don't bother trying to look outside it, just stop now.
576 node
= FindPrevNode(node
, mRootNode
);
579 // Now build up the string moving forward through the DOM until we reach
580 // the soft end and *then* see a DOM word separator, a non-inline-element
581 // boundary, or the hard end node.
582 mSoftText
.Truncate();
583 mSoftTextDOMMapping
.Clear();
584 PRBool seenSoftEnd
= PR_FALSE
;
585 // Leave this outside the loop so large heap string allocations can be reused
589 if (node
== mSoftEnd
.mNode
) {
590 seenSoftEnd
= PR_TRUE
;
593 PRBool exit
= PR_FALSE
;
594 if (IsTextNode(node
)) {
595 GetNodeText(node
, str
);
596 PRInt32 lastOffsetInNode
= str
.Length();
599 // check whether we can stop after this
600 for (PRInt32 i
= node
== mSoftEnd
.mNode
? mSoftEnd
.mOffset
: 0;
601 i
< PRInt32(str
.Length()); ++i
) {
602 if (IsDOMWordSeparator(str
.CharAt(i
))) {
604 // stop at the first separator after the soft end point
605 lastOffsetInNode
= i
;
611 if (firstOffsetInNode
< lastOffsetInNode
) {
612 PRInt32 len
= lastOffsetInNode
- firstOffsetInNode
;
613 mSoftTextDOMMapping
.AppendElement(
614 DOMTextMapping(NodeOffset(node
, firstOffsetInNode
), mSoftText
.Length(), len
));
615 mSoftText
.Append(Substring(str
, firstOffsetInNode
, len
));
618 firstOffsetInNode
= 0;
624 CheckLeavingBreakElementClosure closure
= { mCSSView
, PR_FALSE
};
625 node
= FindNextNode(node
, mRootNode
, CheckLeavingBreakElement
, &closure
);
626 if (closure
.mLeftBreakElement
|| (node
&& IsBreakElement(mCSSView
, node
))) {
627 // We left, or are entering, a break element (e.g., block). Maybe we can
632 mSoftText
.Append(' ');
636 #ifdef DEBUG_SPELLCHECK
637 printf("Got DOM string: %s\n", NS_ConvertUTF16toUTF8(mSoftText
).get());
642 mozInlineSpellWordUtil::BuildRealWords()
644 // This is pretty simple. We just have to walk mSoftText, tokenizing it
645 // into "real words".
646 // We do an outer traversal of words delimited by IsDOMWordSeparator, calling
647 // SplitDOMWord on each of those DOM words
648 PRInt32 wordStart
= -1;
650 for (PRInt32 i
= 0; i
< PRInt32(mSoftText
.Length()); ++i
) {
651 if (IsDOMWordSeparator(mSoftText
.CharAt(i
))) {
652 if (wordStart
>= 0) {
653 SplitDOMWord(wordStart
, i
);
662 if (wordStart
>= 0) {
663 SplitDOMWord(wordStart
, mSoftText
.Length());
667 /*********** DOM/realwords<->mSoftText mapping functions ************/
670 mozInlineSpellWordUtil::MapDOMPositionToSoftTextOffset(NodeOffset aNodeOffset
)
672 if (!mSoftTextValid
) {
673 NS_ERROR("Soft text must be valid if we're to map into it");
677 for (PRInt32 i
= 0; i
< PRInt32(mSoftTextDOMMapping
.Length()); ++i
) {
678 const DOMTextMapping
& map
= mSoftTextDOMMapping
[i
];
679 if (map
.mNodeOffset
.mNode
== aNodeOffset
.mNode
) {
680 // Allow offsets at either end of the string, in particular, allow the
681 // offset that's at the end of the contributed string
682 PRInt32 offsetInContributedString
=
683 aNodeOffset
.mOffset
- map
.mNodeOffset
.mOffset
;
684 if (offsetInContributedString
>= 0 &&
685 offsetInContributedString
<= map
.mLength
)
686 return map
.mSoftTextOffset
+ offsetInContributedString
;
693 mozInlineSpellWordUtil::NodeOffset
694 mozInlineSpellWordUtil::MapSoftTextOffsetToDOMPosition(PRInt32 aSoftTextOffset
,
697 NS_ASSERTION(mSoftTextValid
, "Soft text must be valid if we're to map out of it");
699 return NodeOffset(nsnull
, -1);
701 // The invariant is that the range start..end includes the last mapping,
702 // if any, such that mSoftTextOffset <= aSoftTextOffset
704 PRInt32 end
= mSoftTextDOMMapping
.Length();
705 while (end
- start
>= 2) {
706 PRInt32 mid
= (start
+ end
)/2;
707 const DOMTextMapping
& map
= mSoftTextDOMMapping
[mid
];
708 if (map
.mSoftTextOffset
> aSoftTextOffset
) {
716 return NodeOffset(nsnull
, -1);
718 // 'start' is now the last mapping, if any, such that
719 // mSoftTextOffset <= aSoftTextOffset.
720 // If we're doing HINT_END, then we may want to return the end of the
721 // previous mapping instead of the start of this mapping
722 if (aHint
== HINT_END
&& start
> 0) {
723 const DOMTextMapping
& map
= mSoftTextDOMMapping
[start
- 1];
724 if (map
.mSoftTextOffset
+ map
.mLength
== aSoftTextOffset
)
725 return NodeOffset(map
.mNodeOffset
.mNode
, map
.mNodeOffset
.mOffset
+ map
.mLength
);
728 // We allow ourselves to return the end of this mapping even if we're
729 // doing HINT_BEGIN. This will only happen if there is no mapping which this
730 // point is the start of. I'm not 100% sure this is OK...
731 const DOMTextMapping
& map
= mSoftTextDOMMapping
[start
];
732 PRInt32 offset
= aSoftTextOffset
- map
.mSoftTextOffset
;
733 if (offset
>= 0 && offset
<= map
.mLength
)
734 return NodeOffset(map
.mNodeOffset
.mNode
, map
.mNodeOffset
.mOffset
+ offset
);
736 return NodeOffset(nsnull
, -1);
740 mozInlineSpellWordUtil::FindRealWordContaining(PRInt32 aSoftTextOffset
,
741 DOMMapHint aHint
, PRBool aSearchForward
)
743 NS_ASSERTION(mSoftTextValid
, "Soft text must be valid if we're to map out of it");
747 // The invariant is that the range start..end includes the last word,
748 // if any, such that mSoftTextOffset <= aSoftTextOffset
750 PRInt32 end
= mRealWords
.Length();
751 while (end
- start
>= 2) {
752 PRInt32 mid
= (start
+ end
)/2;
753 const RealWord
& word
= mRealWords
[mid
];
754 if (word
.mSoftTextOffset
> aSoftTextOffset
) {
764 // 'start' is now the last word, if any, such that
765 // mSoftTextOffset <= aSoftTextOffset.
766 // If we're doing HINT_END, then we may want to return the end of the
767 // previous word instead of the start of this word
768 if (aHint
== HINT_END
&& start
> 0) {
769 const RealWord
& word
= mRealWords
[start
- 1];
770 if (word
.mSoftTextOffset
+ word
.mLength
== aSoftTextOffset
)
774 // We allow ourselves to return the end of this word even if we're
775 // doing HINT_BEGIN. This will only happen if there is no word which this
776 // point is the start of. I'm not 100% sure this is OK...
777 const RealWord
& word
= mRealWords
[start
];
778 PRInt32 offset
= aSoftTextOffset
- word
.mSoftTextOffset
;
779 if (offset
>= 0 && offset
<= word
.mLength
)
782 if (aSearchForward
) {
783 if (mRealWords
[0].mSoftTextOffset
> aSoftTextOffset
) {
784 // All words have mSoftTextOffset > aSoftTextOffset
787 // 'start' is the last word such that mSoftTextOffset <= aSoftTextOffset.
788 // Word start+1, if it exists, will be the first with
789 // mSoftTextOffset > aSoftTextOffset.
790 if (start
+ 1 < PRInt32(mRealWords
.Length()))
797 /*********** Word Splitting ************/
799 // classifies a given character in the DOM word
802 CHAR_CLASS_SEPARATOR
,
803 CHAR_CLASS_END_OF_INPUT
};
805 // Encapsulates DOM-word to real-word splitting
806 struct WordSplitState
808 mozInlineSpellWordUtil
* mWordUtil
;
809 const nsDependentSubstring mDOMWordText
;
810 PRInt32 mDOMWordOffset
;
811 CharClass mCurCharClass
;
813 WordSplitState(mozInlineSpellWordUtil
* aWordUtil
,
814 const nsString
& aString
, PRInt32 aStart
, PRInt32 aLen
)
815 : mWordUtil(aWordUtil
), mDOMWordText(aString
, aStart
, aLen
),
816 mDOMWordOffset(0), mCurCharClass(CHAR_CLASS_END_OF_INPUT
) {}
818 CharClass
ClassifyCharacter(PRInt32 aIndex
, PRBool aRecurse
) const;
820 void AdvanceThroughSeparators();
821 void AdvanceThroughWord();
823 // Finds special words like email addresses and URLs that may start at the
824 // current position, and returns their length, or 0 if not found. This allows
825 // arbitrary word breaking rules to be used for these special entities, as
826 // long as they can not contain whitespace.
827 PRInt32
FindSpecialWord();
829 // Similar to FindSpecialWord except that this takes a split word as
830 // input. This checks for things that do not require special word-breaking
832 PRBool
ShouldSkipWord(PRInt32 aStart
, PRInt32 aLength
);
835 // WordSplitState::ClassifyCharacter
838 WordSplitState::ClassifyCharacter(PRInt32 aIndex
, PRBool aRecurse
) const
840 NS_ASSERTION(aIndex
>= 0 && aIndex
<= PRInt32(mDOMWordText
.Length()),
841 "Index out of range");
842 if (aIndex
== PRInt32(mDOMWordText
.Length()))
843 return CHAR_CLASS_SEPARATOR
;
845 // this will classify the character, we want to treat "ignorable" characters
846 // such as soft hyphens as word characters.
847 nsIUGenCategory::nsUGenCategory
848 charCategory
= mWordUtil
->GetCategories()->Get(PRUint32(mDOMWordText
[aIndex
]));
849 if (charCategory
== nsIUGenCategory::kLetter
||
850 IsIgnorableCharacter(mDOMWordText
[aIndex
]))
851 return CHAR_CLASS_WORD
;
853 // If conditional punctuation is surrounded immediately on both sides by word
854 // characters it also counts as a word character.
855 if (IsConditionalPunctuation(mDOMWordText
[aIndex
])) {
857 // not allowed to look around, this punctuation counts like a separator
858 return CHAR_CLASS_SEPARATOR
;
861 // check the left-hand character
863 return CHAR_CLASS_SEPARATOR
;
864 if (ClassifyCharacter(aIndex
- 1, false) != CHAR_CLASS_WORD
)
865 return CHAR_CLASS_SEPARATOR
;
867 // now we know left char is a word-char, check the right-hand character
868 if (aIndex
== PRInt32(mDOMWordText
.Length()) - 1)
869 return CHAR_CLASS_SEPARATOR
;
870 if (ClassifyCharacter(aIndex
+ 1, false) != CHAR_CLASS_WORD
)
871 return CHAR_CLASS_SEPARATOR
;
873 // char on either side is a word, this counts as a word
874 return CHAR_CLASS_WORD
;
877 // all other punctuation
878 if (charCategory
== nsIUGenCategory::kSeparator
||
879 charCategory
== nsIUGenCategory::kOther
||
880 charCategory
== nsIUGenCategory::kPunctuation
||
881 charCategory
== nsIUGenCategory::kSymbol
)
882 return CHAR_CLASS_SEPARATOR
;
884 // any other character counts as a word
885 return CHAR_CLASS_WORD
;
889 // WordSplitState::Advance
892 WordSplitState::Advance()
894 NS_ASSERTION(mDOMWordOffset
>= 0, "Negative word index");
895 NS_ASSERTION(mDOMWordOffset
< (PRInt32
)mDOMWordText
.Length(),
896 "Length beyond end");
899 if (mDOMWordOffset
>= (PRInt32
)mDOMWordText
.Length())
900 mCurCharClass
= CHAR_CLASS_END_OF_INPUT
;
902 mCurCharClass
= ClassifyCharacter(mDOMWordOffset
, PR_TRUE
);
906 // WordSplitState::AdvanceThroughSeparators
909 WordSplitState::AdvanceThroughSeparators()
911 while (mCurCharClass
== CHAR_CLASS_SEPARATOR
)
915 // WordSplitState::AdvanceThroughWord
918 WordSplitState::AdvanceThroughWord()
920 while (mCurCharClass
== CHAR_CLASS_WORD
)
925 // WordSplitState::FindSpecialWord
// Scans the DOM word from mDOMWordOffset onward for '@', '.' and ':' in order
// to recognise e-mail addresses and URLs. Every visible match returns
// mDOMWordText.Length() - mDOMWordOffset, i.e. the number of characters from
// the current offset to the end of the DOM word.
928 WordSplitState::FindSpecialWord()
932 // Search for email addresses. We simply define these as any sequence of
933 // characters with an '@' character in the middle. The DOM word is already
934 // split on whitespace, so we know that everything to the end is the address
936 // Also look for periods, this tells us if we want to run the URL finder.
937 PRBool foundDot
= PR_FALSE
;
938 PRInt32 firstColon
= -1;
939 for (i
= mDOMWordOffset
;
940 i
< PRInt32(mDOMWordText
.Length()); i
++) {
941 if (mDOMWordText
[i
] == '@') {
942 // only accept this if there are unambiguous word characters (don't bother
943 // recursing to disambiguate apostrophes) on each side. This prevents
944 // classifying, e.g. "@home" as an email address
946 // Use this condition to only accept words with '@' in the middle of
947 // them. It works, but the inline spellchecker doesn't like this. The
948 // problem is that when you type "fhsgfh@", that's a misspelled word followed
949 // by a symbol, but when you type another letter ("fhsgfh@g") that first word
950 // needs to be unmarked as misspelled. It doesn't do this; it only checks the
951 // current position for potentially removing a spelling range.
952 if (i
> 0 && ClassifyCharacter(i
- 1, PR_FALSE
) == CHAR_CLASS_WORD
&&
953 i
< (PRInt32
)mDOMWordText
.Length() - 1 &&
954 ClassifyCharacter(i
+ 1, PR_FALSE
) == CHAR_CLASS_WORD
)
956 return mDOMWordText
.Length() - mDOMWordOffset
;
957 } else if (mDOMWordText
[i
] == '.' && ! foundDot
&&
958 i
> 0 && i
< (PRInt32
)mDOMWordText
.Length() - 1) {
959 // we found a period not at the end, we should check harder for URLs
961 } else if (mDOMWordText
[i
] == ':' && firstColon
< 0) {
966 // If the first colon is followed by a slash, consider it a URL
967 // This will catch things like asdf://foo.com
968 if (firstColon
>= 0 && firstColon
< (PRInt32
)mDOMWordText
.Length() - 1 &&
969 mDOMWordText
[firstColon
+ 1] == '/') {
970 return mDOMWordText
.Length() - mDOMWordOffset
;
973 // Check the text before the first colon against some known protocols. It
974 // is impossible to check against all protocols, especially since you can
975 // plug in new protocols. We also don't want to waste time here checking
976 // against a lot of obscure protocols.
977 if (firstColon
> mDOMWordOffset
) {
978 nsString
protocol(Substring(mDOMWordText
, mDOMWordOffset
,
979 firstColon
- mDOMWordOffset
));
// NOTE(review): "ftp" is compared twice in the chain below (first and last
// "ftp" tests); the second comparison is redundant and can be dropped.
980 if (protocol
.EqualsIgnoreCase("http") ||
981 protocol
.EqualsIgnoreCase("https") ||
982 protocol
.EqualsIgnoreCase("news") ||
983 protocol
.EqualsIgnoreCase("ftp") ||
984 protocol
.EqualsIgnoreCase("file") ||
985 protocol
.EqualsIgnoreCase("javascript") ||
986 protocol
.EqualsIgnoreCase("ftp")) {
987 return mDOMWordText
.Length() - mDOMWordOffset
;
991 // not anything special
995 // WordSplitState::ShouldSkipWord
998 WordSplitState::ShouldSkipWord(PRInt32 aStart
, PRInt32 aLength
)
1000 PRInt32 last
= aStart
+ aLength
;
1002 // check to see if the word contains a digit
1003 for (PRInt32 i
= aStart
; i
< last
; i
++) {
1004 PRUnichar ch
= mDOMWordText
[i
];
1005 // XXX Shouldn't this be something a lot more complex, Unicode-based?
1006 if (ch
>= '0' && ch
<= '9')
1014 // mozInlineSpellWordUtil::SplitDOMWord
1017 mozInlineSpellWordUtil::SplitDOMWord(PRInt32 aStart
, PRInt32 aEnd
)
1019 WordSplitState
state(this, mSoftText
, aStart
, aEnd
- aStart
);
1020 state
.mCurCharClass
= state
.ClassifyCharacter(0, PR_TRUE
);
1022 while (state
.mCurCharClass
!= CHAR_CLASS_END_OF_INPUT
) {
1023 state
.AdvanceThroughSeparators();
1024 if (state
.mCurCharClass
== CHAR_CLASS_END_OF_INPUT
)
1027 PRInt32 specialWordLength
= state
.FindSpecialWord();
1028 if (specialWordLength
> 0) {
1029 mRealWords
.AppendElement(
1030 RealWord(aStart
+ state
.mDOMWordOffset
, specialWordLength
, PR_FALSE
));
1032 // skip the special word
1033 state
.mDOMWordOffset
+= specialWordLength
;
1034 if (state
.mDOMWordOffset
+ aStart
>= aEnd
)
1035 state
.mCurCharClass
= CHAR_CLASS_END_OF_INPUT
;
1037 state
.mCurCharClass
= state
.ClassifyCharacter(state
.mDOMWordOffset
, PR_TRUE
);
1041 // save the beginning of the word
1042 PRInt32 wordOffset
= state
.mDOMWordOffset
;
1044 // find the end of the word
1045 state
.AdvanceThroughWord();
1046 PRInt32 wordLen
= state
.mDOMWordOffset
- wordOffset
;
1047 mRealWords
.AppendElement(
1048 RealWord(aStart
+ wordOffset
, wordLen
,
1049 !state
.ShouldSkipWord(wordOffset
, wordLen
)));