Bug 470455 - test_database_sync_embed_visits.js leaks, r=sdwilsh
[wine-gecko.git] / extensions / spellcheck / src / mozInlineSpellWordUtil.cpp
blob53e90dc1e2b3cd69a79501ce2be6b7efdca3f7b9
1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* ***** BEGIN LICENSE BLOCK *****
3 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
5 * The contents of this file are subject to the Mozilla Public License Version
6 * 1.1 (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 * http://www.mozilla.org/MPL/
10 * Software distributed under the License is distributed on an "AS IS" basis,
11 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
12 * for the specific language governing rights and limitations under the
13 * License.
15 * The Original Code is inline spellchecker code.
17 * The Initial Developer of the Original Code is Google Inc.
18 * Portions created by the Initial Developer are Copyright (C) 2004-2006
19 * the Initial Developer. All Rights Reserved.
21 * Contributor(s):
22 * Brett Wilson <brettw@gmail.com> (original author)
23 * Robert O'Callahan <rocallahan@novell.com>
25 * Alternatively, the contents of this file may be used under the terms of
26 * either the GNU General Public License Version 2 or later (the "GPL"), or
27 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
28 * in which case the provisions of the GPL or the LGPL are applicable instead
29 * of those above. If you wish to allow use of your version of this file only
30 * under the terms of either the GPL or the LGPL, and not to allow others to
31 * use your version of this file under the terms of the MPL, indicate your
32 * decision by deleting the provisions above and replace them with the notice
33 * and other provisions required by the GPL or the LGPL. If you do not delete
34 * the provisions above, a recipient may use your version of this file under
35 * the terms of any one of the MPL, the GPL or the LGPL.
37 * ***** END LICENSE BLOCK ***** */
39 #include "mozInlineSpellWordUtil.h"
40 #include "nsDebug.h"
41 #include "nsIAtom.h"
42 #include "nsComponentManagerUtils.h"
43 #include "nsIDOMCSSStyleDeclaration.h"
44 #include "nsIDOMDocumentView.h"
45 #include "nsIDOMElement.h"
46 #include "nsIDOMNSRange.h"
47 #include "nsIDOMRange.h"
48 #include "nsIEditor.h"
49 #include "nsIDOMNode.h"
50 #include "nsIDOMHTMLBRElement.h"
51 #include "nsUnicharUtilCIID.h"
52 #include "nsServiceManagerUtils.h"
54 // IsIgnorableCharacter
56 // These characters are ones that we should ignore in input.
58 inline PRBool IsIgnorableCharacter(PRUnichar ch)
60 return (ch == 0x200D || // ZERO-WIDTH JOINER
61 ch == 0xAD || // SOFT HYPHEN
62 ch == 0x1806); // MONGOLIAN TODO SOFT HYPHEN
65 // IsConditionalPunctuation
67 // Some characters (like apostrophes) require characters on each side to be
68 // part of a word, and are otherwise punctuation.
70 inline PRBool IsConditionalPunctuation(PRUnichar ch)
72 return (ch == '\'' ||
73 ch == 0x2019); // RIGHT SINGLE QUOTATION MARK
76 // mozInlineSpellWordUtil::Init
78 nsresult
79 mozInlineSpellWordUtil::Init(nsWeakPtr aWeakEditor)
81 nsresult rv;
83 mCategories = do_GetService(NS_UNICHARCATEGORY_CONTRACTID, &rv);
84 if (NS_FAILED(rv))
85 return rv;
87 // getting the editor can fail commonly because the editor was detached, so
88 // don't assert
89 nsCOMPtr<nsIEditor> editor = do_QueryReferent(aWeakEditor, &rv);
90 if (NS_FAILED(rv))
91 return rv;
93 nsCOMPtr<nsIDOMDocument> domDoc;
94 rv = editor->GetDocument(getter_AddRefs(domDoc));
95 NS_ENSURE_SUCCESS(rv, rv);
97 mDocument = do_QueryInterface(domDoc, &rv);
98 NS_ENSURE_SUCCESS(rv, rv);
100 mDOMDocumentRange = do_QueryInterface(domDoc, &rv);
101 NS_ENSURE_SUCCESS(rv, rv);
103 // view
104 nsCOMPtr<nsIDOMDocumentView> docView = do_QueryInterface(domDoc, &rv);
105 NS_ENSURE_SUCCESS(rv, rv);
106 nsCOMPtr<nsIDOMAbstractView> abstractView;
107 rv = docView->GetDefaultView(getter_AddRefs(abstractView));
108 NS_ENSURE_SUCCESS(rv, rv);
109 mCSSView = do_QueryInterface(abstractView, &rv);
110 NS_ENSURE_SUCCESS(rv, rv);
112 // Find the root node for the editor. For contenteditable we'll need something
113 // cleverer here.
114 nsCOMPtr<nsIDOMElement> rootElt;
115 rv = editor->GetRootElement(getter_AddRefs(rootElt));
116 NS_ENSURE_SUCCESS(rv, rv);
118 mRootNode = rootElt;
119 NS_ASSERTION(mRootNode, "GetRootElement returned null *and* claimed to suceed!");
120 return NS_OK;
123 static PRBool
124 IsTextNode(nsIDOMNode* aNode)
126 PRUint16 type = 0;
127 aNode->GetNodeType(&type);
128 return type == nsIDOMNode::TEXT_NODE;
131 typedef void (* OnLeaveNodeFunPtr)(nsIDOMNode* aNode, void* aClosure);
133 // Find the next node in the DOM tree in preorder. This isn't fast because
134 // one call to GetNextSibling can be O(N) in the number of siblings...
135 // Calls OnLeaveNodeFunPtr when the traversal leaves a node
136 static nsIDOMNode*
137 FindNextNode(nsIDOMNode* aNode, nsIDOMNode* aRoot,
138 OnLeaveNodeFunPtr aOnLeaveNode = nsnull, void* aClosure = nsnull)
140 NS_PRECONDITION(aNode, "Null starting node?");
142 nsCOMPtr<nsIDOMNode> next;
143 aNode->GetFirstChild(getter_AddRefs(next));
144 if (next)
145 return next;
147 // Don't look at siblings or otherwise outside of aRoot
148 if (aNode == aRoot)
149 return nsnull;
151 aNode->GetNextSibling(getter_AddRefs(next));
152 if (next)
153 return next;
155 // Go up
156 for (;;) {
157 if (aOnLeaveNode) {
158 aOnLeaveNode(aNode, aClosure);
161 aNode->GetParentNode(getter_AddRefs(next));
162 if (next == aRoot || ! next)
163 return nsnull;
164 aNode = next;
166 aNode->GetNextSibling(getter_AddRefs(next));
167 if (next)
168 return next;
172 // aNode is not a text node. Find the first text node starting at aNode/aOffset
173 // in a preorder DOM traversal.
174 static nsIDOMNode*
175 FindNextTextNode(nsIDOMNode* aNode, PRInt32 aOffset, nsIDOMNode* aRoot)
177 NS_PRECONDITION(aNode, "Null starting node?");
178 NS_ASSERTION(!IsTextNode(aNode), "FindNextTextNode should start with a non-text node");
180 nsIDOMNode* checkNode;
181 // Need to start at the aOffset'th child
182 nsCOMPtr<nsIDOMNode> child;
183 aNode->GetFirstChild(getter_AddRefs(child));
184 while (child && aOffset > 0) {
185 nsCOMPtr<nsIDOMNode> next;
186 child->GetNextSibling(getter_AddRefs(next));
187 child.swap(next);
188 --aOffset;
190 if (child) {
191 checkNode = child;
192 } else {
193 // aOffset was beyond the end of the child list.
194 // goto next node in a preorder DOM traversal.
195 nsCOMPtr<nsIDOMNode> next;
196 aNode->GetNextSibling(getter_AddRefs(next));
197 while (!next) {
198 // Go up
199 aNode->GetParentNode(getter_AddRefs(next));
200 if (next == aRoot || !next) {
201 return nsnull;
203 aNode = next;
204 aNode->GetNextSibling(getter_AddRefs(next));
206 checkNode = next;
209 while (checkNode && !IsTextNode(checkNode)) {
210 checkNode = FindNextNode(checkNode, aRoot);
212 return checkNode;
215 // mozInlineSpellWordUtil::SetEnd
217 // We have two ranges "hard" and "soft". The hard boundary is simply
218 // the scope of the root node. The soft boundary is that which is set
219 // by the caller of this class by calling this function. If this function is
220 // not called, the soft boundary is the same as the hard boundary.
222 // When we reach the soft boundary (mSoftEnd), we keep
223 // going until we reach the end of a word. This allows the caller to set the
224 // end of the range to anything, and we will always check whole multiples of
225 // words. When we reach the hard boundary we stop no matter what.
227 // There is no beginning soft boundary. This is because we only go to the
228 // previous node once, when finding the previous word boundary in
229 // SetPosition(). You might think of the soft boundary as being this initial
230 // position.
232 nsresult
233 mozInlineSpellWordUtil::SetEnd(nsIDOMNode* aEndNode, PRInt32 aEndOffset)
235 NS_PRECONDITION(aEndNode, "Null end node?");
237 NS_ASSERTION(mRootNode, "Not initialized");
239 InvalidateWords();
241 if (!IsTextNode(aEndNode)) {
242 // End at the start of the first text node after aEndNode/aEndOffset.
243 aEndNode = FindNextTextNode(aEndNode, aEndOffset, mRootNode);
244 aEndOffset = 0;
246 mSoftEnd = NodeOffset(aEndNode, aEndOffset);
247 return NS_OK;
250 nsresult
251 mozInlineSpellWordUtil::SetPosition(nsIDOMNode* aNode, PRInt32 aOffset)
253 InvalidateWords();
255 if (!IsTextNode(aNode)) {
256 // Start at the start of the first text node after aNode/aOffset.
257 aNode = FindNextTextNode(aNode, aOffset, mRootNode);
258 aOffset = 0;
260 mSoftBegin = NodeOffset(aNode, aOffset);
262 EnsureWords();
264 PRInt32 textOffset = MapDOMPositionToSoftTextOffset(mSoftBegin);
265 if (textOffset < 0)
266 return NS_OK;
267 mNextWordIndex = FindRealWordContaining(textOffset, HINT_END, PR_TRUE);
268 return NS_OK;
271 void
272 mozInlineSpellWordUtil::EnsureWords()
274 if (mSoftTextValid)
275 return;
276 BuildSoftText();
277 BuildRealWords();
278 mSoftTextValid = PR_TRUE;
281 nsresult
282 mozInlineSpellWordUtil::MakeRangeForWord(const RealWord& aWord, nsIDOMRange** aRange)
284 NodeOffset begin = MapSoftTextOffsetToDOMPosition(aWord.mSoftTextOffset, HINT_BEGIN);
285 NodeOffset end = MapSoftTextOffsetToDOMPosition(aWord.EndOffset(), HINT_END);
286 return MakeRange(begin, end, aRange);
289 // mozInlineSpellWordUtil::GetRangeForWord
291 nsresult
292 mozInlineSpellWordUtil::GetRangeForWord(nsIDOMNode* aWordNode,
293 PRInt32 aWordOffset,
294 nsIDOMRange** aRange)
296 // Set our soft end and start
297 NodeOffset pt = NodeOffset(aWordNode, aWordOffset);
299 InvalidateWords();
300 mSoftBegin = mSoftEnd = pt;
301 EnsureWords();
303 PRInt32 offset = MapDOMPositionToSoftTextOffset(pt);
304 if (offset < 0)
305 return MakeRange(pt, pt, aRange);
306 PRInt32 wordIndex = FindRealWordContaining(offset, HINT_BEGIN, PR_FALSE);
307 if (wordIndex < 0)
308 return MakeRange(pt, pt, aRange);
309 return MakeRangeForWord(mRealWords[wordIndex], aRange);
312 // This is to fix characters that the spellchecker may not like
313 static void
314 NormalizeWord(const nsSubstring& aInput, PRInt32 aPos, PRInt32 aLen, nsAString& aOutput)
316 aOutput.Truncate();
317 for (PRInt32 i = 0; i < aLen; i++) {
318 PRUnichar ch = aInput.CharAt(i + aPos);
320 // remove ignorable characters from the word
321 if (IsIgnorableCharacter(ch))
322 continue;
324 // the spellchecker doesn't handle curly apostrophes in all languages
325 if (ch == 0x2019) { // RIGHT SINGLE QUOTATION MARK
326 ch = '\'';
329 aOutput.Append(ch);
333 // mozInlineSpellWordUtil::GetNextWord
335 // FIXME-optimization: we shouldn't have to generate a range every single
336 // time. It would be better if the inline spellchecker didn't require a
337 // range unless the word was misspelled. This may or may not be possible.
339 nsresult
340 mozInlineSpellWordUtil::GetNextWord(nsAString& aText, nsIDOMRange** aRange,
341 PRBool* aSkipChecking)
343 #ifdef DEBUG_SPELLCHECK
344 printf("GetNextWord called; mNextWordIndex=%d\n", mNextWordIndex);
345 #endif
347 if (mNextWordIndex < 0 ||
348 mNextWordIndex >= PRInt32(mRealWords.Length())) {
349 mNextWordIndex = -1;
350 *aRange = nsnull;
351 *aSkipChecking = PR_TRUE;
352 return NS_OK;
355 const RealWord& word = mRealWords[mNextWordIndex];
356 nsresult rv = MakeRangeForWord(word, aRange);
357 NS_ENSURE_SUCCESS(rv, rv);
358 ++mNextWordIndex;
359 *aSkipChecking = !word.mCheckableWord;
360 ::NormalizeWord(mSoftText, word.mSoftTextOffset, word.mLength, aText);
362 #ifdef DEBUG_SPELLCHECK
363 printf("GetNextWord returning: %s (skip=%d)\n",
364 NS_ConvertUTF16toUTF8(aText).get(), *aSkipChecking);
365 #endif
367 return NS_OK;
370 // mozInlineSpellWordUtil::MakeRange
372 // Convenience function for creating a range over the current document.
374 nsresult
375 mozInlineSpellWordUtil::MakeRange(NodeOffset aBegin, NodeOffset aEnd,
376 nsIDOMRange** aRange)
378 if (! mDOMDocumentRange)
379 return NS_ERROR_NOT_INITIALIZED;
381 nsresult rv = mDOMDocumentRange->CreateRange(aRange);
382 NS_ENSURE_SUCCESS(rv, rv);
384 rv = (*aRange)->SetStart(aBegin.mNode, aBegin.mOffset);
385 NS_ENSURE_SUCCESS(rv, rv);
386 rv = (*aRange)->SetEnd(aEnd.mNode, aEnd.mOffset);
387 NS_ENSURE_SUCCESS(rv, rv);
389 return NS_OK;
392 /*********** DOM text extraction ************/
394 // IsDOMWordSeparator
396 // Determines if the given character should be considered as a DOM Word
397 // separator. Basically, this is whitespace, although it could also have
398 // certain punctuation that we know ALWAYS breaks words. This is important.
399 // For example, we can't have any punctuation that could appear in a URL
400 // or email address in this, because those need to always fit into a single
401 // DOM word.
403 static PRBool
404 IsDOMWordSeparator(PRUnichar ch)
406 // simple spaces
407 if (ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r')
408 return PR_TRUE;
410 // complex spaces - check only if char isn't ASCII (uncommon)
411 if (ch >= 0xA0 &&
412 (ch == 0x00A0 || // NO-BREAK SPACE
413 ch == 0x2002 || // EN SPACE
414 ch == 0x2003 || // EM SPACE
415 ch == 0x2009 || // THIN SPACE
416 ch == 0x200C || // ZERO WIDTH NON-JOINER
417 ch == 0x3000)) // IDEOGRAPHIC SPACE
418 return PR_TRUE;
420 // otherwise not a space
421 return PR_FALSE;
424 static PRBool
425 IsBRElement(nsIDOMNode* aNode)
427 nsresult rv;
428 nsCOMPtr<nsIDOMHTMLBRElement> elt = do_QueryInterface(aNode, &rv);
429 return NS_SUCCEEDED(rv);
432 static void
433 GetNodeText(nsIDOMNode* aNode, nsAutoString& aText)
435 nsresult rv = aNode->GetNodeValue(aText);
436 NS_ASSERTION(NS_SUCCEEDED(rv), "Unable to get node text");
439 // Find the previous node in the DOM tree in preorder. This isn't fast because
440 // one call to GetPrevSibling can be O(N) in the number of siblings...
441 static nsIDOMNode*
442 FindPrevNode(nsIDOMNode* aNode, nsIDOMNode* aRoot)
444 if (aNode == aRoot)
445 return nsnull;
447 nsCOMPtr<nsIDOMNode> prev;
448 aNode->GetPreviousSibling(getter_AddRefs(prev));
449 if (prev) {
450 for (;;) {
451 nsCOMPtr<nsIDOMNode> lastChild;
452 prev->GetLastChild(getter_AddRefs(lastChild));
453 if (!lastChild)
454 return prev;
455 prev = lastChild;
459 // No prev sibling. So we are the first child of our parent, if any. Our
460 // parent is our previous node.
461 aNode->GetParentNode(getter_AddRefs(prev));
462 return prev;
466 * Check if there's a DOM word separator before aBeforeOffset in this node.
467 * Always returns PR_TRUE if it's a BR element.
468 * aSeparatorOffset is set to the index of the last separator if any is found
469 * (0 for BR elements).
471 static PRBool
472 ContainsDOMWordSeparator(nsIDOMNode* aNode, PRInt32 aBeforeOffset,
473 PRInt32* aSeparatorOffset)
475 if (IsBRElement(aNode)) {
476 *aSeparatorOffset = 0;
477 return PR_TRUE;
480 if (!IsTextNode(aNode))
481 return PR_FALSE;
483 nsAutoString str;
484 GetNodeText(aNode, str);
485 for (PRInt32 i = PR_MIN(aBeforeOffset, PRInt32(str.Length())) - 1; i >= 0; --i) {
486 if (IsDOMWordSeparator(str.CharAt(i))) {
487 *aSeparatorOffset = i;
488 return PR_TRUE;
491 return PR_FALSE;
494 static PRBool
495 IsBreakElement(nsIDOMViewCSS* aDocView, nsIDOMNode* aNode)
497 nsCOMPtr<nsIDOMElement> element = do_QueryInterface(aNode);
498 if (!element)
499 return PR_FALSE;
501 if (IsBRElement(aNode))
502 return PR_TRUE;
504 nsCOMPtr<nsIDOMCSSStyleDeclaration> style;
505 aDocView->GetComputedStyle(element, EmptyString(), getter_AddRefs(style));
506 if (!style)
507 return PR_FALSE;
509 #ifdef DEBUG_SPELLCHECK
510 printf(" searching element %p\n", (void*)aNode);
511 #endif
513 nsAutoString display;
514 style->GetPropertyValue(NS_LITERAL_STRING("display"), display);
515 #ifdef DEBUG_SPELLCHECK
516 printf(" display=\"%s\"\n", NS_ConvertUTF16toUTF8(display).get());
517 #endif
518 if (!display.EqualsLiteral("inline"))
519 return PR_TRUE;
521 nsAutoString position;
522 style->GetPropertyValue(NS_LITERAL_STRING("position"), position);
523 #ifdef DEBUG_SPELLCHECK
524 printf(" position=%s\n", NS_ConvertUTF16toUTF8(position).get());
525 #endif
526 if (!position.EqualsLiteral("static"))
527 return PR_TRUE;
529 // XXX What about floats? What else?
530 return PR_FALSE;
533 struct CheckLeavingBreakElementClosure {
534 nsIDOMViewCSS* mDocView;
535 PRPackedBool mLeftBreakElement;
538 static void
539 CheckLeavingBreakElement(nsIDOMNode* aNode, void* aClosure)
541 CheckLeavingBreakElementClosure* cl =
542 static_cast<CheckLeavingBreakElementClosure*>(aClosure);
543 if (!cl->mLeftBreakElement && IsBreakElement(cl->mDocView, aNode)) {
544 cl->mLeftBreakElement = PR_TRUE;
548 void
549 mozInlineSpellWordUtil::NormalizeWord(nsSubstring& aWord)
551 nsAutoString result;
552 ::NormalizeWord(aWord, 0, aWord.Length(), result);
553 aWord = result;
556 void
557 mozInlineSpellWordUtil::BuildSoftText()
559 // First we have to work backwards from mSoftStart to find a text node
560 // containing a DOM word separator, a non-inline-element
561 // boundary, or the hard start node. That's where we'll start building the
562 // soft string from.
563 nsIDOMNode* node = mSoftBegin.mNode;
564 PRInt32 firstOffsetInNode = 0;
565 PRInt32 checkBeforeOffset = mSoftBegin.mOffset;
566 while (node) {
567 if (ContainsDOMWordSeparator(node, checkBeforeOffset, &firstOffsetInNode))
568 break;
569 checkBeforeOffset = PR_INT32_MAX;
570 if (IsBreakElement(mCSSView, node)) {
571 // Since FindPrevNode follows tree *preorder*, we're about to traverse
572 // up out of 'node'. Since node induces breaks (e.g., it's a block),
573 // don't bother trying to look outside it, just stop now.
574 break;
576 node = FindPrevNode(node, mRootNode);
579 // Now build up the string moving forward through the DOM until we reach
580 // the soft end and *then* see a DOM word separator, a non-inline-element
581 // boundary, or the hard end node.
582 mSoftText.Truncate();
583 mSoftTextDOMMapping.Clear();
584 PRBool seenSoftEnd = PR_FALSE;
585 // Leave this outside the loop so large heap string allocations can be reused
586 // across iterations
587 nsAutoString str;
588 while (node) {
589 if (node == mSoftEnd.mNode) {
590 seenSoftEnd = PR_TRUE;
593 PRBool exit = PR_FALSE;
594 if (IsTextNode(node)) {
595 GetNodeText(node, str);
596 PRInt32 lastOffsetInNode = str.Length();
598 if (seenSoftEnd) {
599 // check whether we can stop after this
600 for (PRInt32 i = node == mSoftEnd.mNode ? mSoftEnd.mOffset : 0;
601 i < PRInt32(str.Length()); ++i) {
602 if (IsDOMWordSeparator(str.CharAt(i))) {
603 exit = PR_TRUE;
604 // stop at the first separator after the soft end point
605 lastOffsetInNode = i;
606 break;
611 if (firstOffsetInNode < lastOffsetInNode) {
612 PRInt32 len = lastOffsetInNode - firstOffsetInNode;
613 mSoftTextDOMMapping.AppendElement(
614 DOMTextMapping(NodeOffset(node, firstOffsetInNode), mSoftText.Length(), len));
615 mSoftText.Append(Substring(str, firstOffsetInNode, len));
618 firstOffsetInNode = 0;
621 if (exit)
622 break;
624 CheckLeavingBreakElementClosure closure = { mCSSView, PR_FALSE };
625 node = FindNextNode(node, mRootNode, CheckLeavingBreakElement, &closure);
626 if (closure.mLeftBreakElement || (node && IsBreakElement(mCSSView, node))) {
627 // We left, or are entering, a break element (e.g., block). Maybe we can
628 // stop now.
629 if (seenSoftEnd)
630 break;
631 // Record the break
632 mSoftText.Append(' ');
636 #ifdef DEBUG_SPELLCHECK
637 printf("Got DOM string: %s\n", NS_ConvertUTF16toUTF8(mSoftText).get());
638 #endif
641 void
642 mozInlineSpellWordUtil::BuildRealWords()
644 // This is pretty simple. We just have to walk mSoftText, tokenizing it
645 // into "real words".
646 // We do an outer traversal of words delimited by IsDOMWordSeparator, calling
647 // SplitDOMWord on each of those DOM words
648 PRInt32 wordStart = -1;
649 mRealWords.Clear();
650 for (PRInt32 i = 0; i < PRInt32(mSoftText.Length()); ++i) {
651 if (IsDOMWordSeparator(mSoftText.CharAt(i))) {
652 if (wordStart >= 0) {
653 SplitDOMWord(wordStart, i);
654 wordStart = -1;
656 } else {
657 if (wordStart < 0) {
658 wordStart = i;
662 if (wordStart >= 0) {
663 SplitDOMWord(wordStart, mSoftText.Length());
667 /*********** DOM/realwords<->mSoftText mapping functions ************/
669 PRInt32
670 mozInlineSpellWordUtil::MapDOMPositionToSoftTextOffset(NodeOffset aNodeOffset)
672 if (!mSoftTextValid) {
673 NS_ERROR("Soft text must be valid if we're to map into it");
674 return -1;
677 for (PRInt32 i = 0; i < PRInt32(mSoftTextDOMMapping.Length()); ++i) {
678 const DOMTextMapping& map = mSoftTextDOMMapping[i];
679 if (map.mNodeOffset.mNode == aNodeOffset.mNode) {
680 // Allow offsets at either end of the string, in particular, allow the
681 // offset that's at the end of the contributed string
682 PRInt32 offsetInContributedString =
683 aNodeOffset.mOffset - map.mNodeOffset.mOffset;
684 if (offsetInContributedString >= 0 &&
685 offsetInContributedString <= map.mLength)
686 return map.mSoftTextOffset + offsetInContributedString;
687 return -1;
690 return -1;
693 mozInlineSpellWordUtil::NodeOffset
694 mozInlineSpellWordUtil::MapSoftTextOffsetToDOMPosition(PRInt32 aSoftTextOffset,
695 DOMMapHint aHint)
697 NS_ASSERTION(mSoftTextValid, "Soft text must be valid if we're to map out of it");
698 if (!mSoftTextValid)
699 return NodeOffset(nsnull, -1);
701 // The invariant is that the range start..end includes the last mapping,
702 // if any, such that mSoftTextOffset <= aSoftTextOffset
703 PRInt32 start = 0;
704 PRInt32 end = mSoftTextDOMMapping.Length();
705 while (end - start >= 2) {
706 PRInt32 mid = (start + end)/2;
707 const DOMTextMapping& map = mSoftTextDOMMapping[mid];
708 if (map.mSoftTextOffset > aSoftTextOffset) {
709 end = mid;
710 } else {
711 start = mid;
715 if (start >= end)
716 return NodeOffset(nsnull, -1);
718 // 'start' is now the last mapping, if any, such that
719 // mSoftTextOffset <= aSoftTextOffset.
720 // If we're doing HINT_END, then we may want to return the end of the
721 // the previous mapping instead of the start of this mapping
722 if (aHint == HINT_END && start > 0) {
723 const DOMTextMapping& map = mSoftTextDOMMapping[start - 1];
724 if (map.mSoftTextOffset + map.mLength == aSoftTextOffset)
725 return NodeOffset(map.mNodeOffset.mNode, map.mNodeOffset.mOffset + map.mLength);
728 // We allow ourselves to return the end of this mapping even if we're
729 // doing HINT_START. This will only happen if there is no mapping which this
730 // point is the start of. I'm not 100% sure this is OK...
731 const DOMTextMapping& map = mSoftTextDOMMapping[start];
732 PRInt32 offset = aSoftTextOffset - map.mSoftTextOffset;
733 if (offset >= 0 && offset <= map.mLength)
734 return NodeOffset(map.mNodeOffset.mNode, map.mNodeOffset.mOffset + offset);
736 return NodeOffset(nsnull, -1);
739 PRInt32
740 mozInlineSpellWordUtil::FindRealWordContaining(PRInt32 aSoftTextOffset,
741 DOMMapHint aHint, PRBool aSearchForward)
743 NS_ASSERTION(mSoftTextValid, "Soft text must be valid if we're to map out of it");
744 if (!mSoftTextValid)
745 return -1;
747 // The invariant is that the range start..end includes the last word,
748 // if any, such that mSoftTextOffset <= aSoftTextOffset
749 PRInt32 start = 0;
750 PRInt32 end = mRealWords.Length();
751 while (end - start >= 2) {
752 PRInt32 mid = (start + end)/2;
753 const RealWord& word = mRealWords[mid];
754 if (word.mSoftTextOffset > aSoftTextOffset) {
755 end = mid;
756 } else {
757 start = mid;
761 if (start >= end)
762 return -1;
764 // 'start' is now the last word, if any, such that
765 // mSoftTextOffset <= aSoftTextOffset.
766 // If we're doing HINT_END, then we may want to return the end of the
767 // the previous word instead of the start of this word
768 if (aHint == HINT_END && start > 0) {
769 const RealWord& word = mRealWords[start - 1];
770 if (word.mSoftTextOffset + word.mLength == aSoftTextOffset)
771 return start - 1;
774 // We allow ourselves to return the end of this word even if we're
775 // doing HINT_START. This will only happen if there is no word which this
776 // point is the start of. I'm not 100% sure this is OK...
777 const RealWord& word = mRealWords[start];
778 PRInt32 offset = aSoftTextOffset - word.mSoftTextOffset;
779 if (offset >= 0 && offset <= word.mLength)
780 return start;
782 if (aSearchForward) {
783 if (mRealWords[0].mSoftTextOffset > aSoftTextOffset) {
784 // All words have mSoftTextOffset > aSoftTextOffset
785 return 0;
787 // 'start' is the last word such that mSoftTextOffset <= aSoftTextOffset.
788 // Word start+1, if it exists, will be the first with
789 // mSoftTextOffset > aSoftTextOffset.
790 if (start + 1 < PRInt32(mRealWords.Length()))
791 return start + 1;
794 return -1;
797 /*********** Word Splitting ************/
// Classification of a single position in a DOM word while it is being split
// into real words.
enum CharClass {
  CHAR_CLASS_WORD,          // belongs to a word
  CHAR_CLASS_SEPARATOR,     // separates words
  CHAR_CLASS_END_OF_INPUT   // the end of the DOM word has been reached
};
805 // Encapsulates DOM-word to real-word splitting
806 struct WordSplitState
808 mozInlineSpellWordUtil* mWordUtil;
809 const nsDependentSubstring mDOMWordText;
810 PRInt32 mDOMWordOffset;
811 CharClass mCurCharClass;
813 WordSplitState(mozInlineSpellWordUtil* aWordUtil,
814 const nsString& aString, PRInt32 aStart, PRInt32 aLen)
815 : mWordUtil(aWordUtil), mDOMWordText(aString, aStart, aLen),
816 mDOMWordOffset(0), mCurCharClass(CHAR_CLASS_END_OF_INPUT) {}
818 CharClass ClassifyCharacter(PRInt32 aIndex, PRBool aRecurse) const;
819 void Advance();
820 void AdvanceThroughSeparators();
821 void AdvanceThroughWord();
823 // Finds special words like email addresses and URLs that may start at the
824 // current position, and returns their length, or 0 if not found. This allows
825 // arbitrary word breaking rules to be used for these special entities, as
826 // long as they can not contain whitespace.
827 PRInt32 FindSpecialWord();
829 // Similar to FindSpecialWord except that this takes a split word as
830 // input. This checks for things that do not require special word-breaking
831 // rules.
832 PRBool ShouldSkipWord(PRInt32 aStart, PRInt32 aLength);
835 // WordSplitState::ClassifyCharacter
837 CharClass
838 WordSplitState::ClassifyCharacter(PRInt32 aIndex, PRBool aRecurse) const
840 NS_ASSERTION(aIndex >= 0 && aIndex <= PRInt32(mDOMWordText.Length()),
841 "Index out of range");
842 if (aIndex == PRInt32(mDOMWordText.Length()))
843 return CHAR_CLASS_SEPARATOR;
845 // this will classify the character, we want to treat "ignorable" characters
846 // such as soft hyphens as word characters.
847 nsIUGenCategory::nsUGenCategory
848 charCategory = mWordUtil->GetCategories()->Get(PRUint32(mDOMWordText[aIndex]));
849 if (charCategory == nsIUGenCategory::kLetter ||
850 IsIgnorableCharacter(mDOMWordText[aIndex]))
851 return CHAR_CLASS_WORD;
853 // If conditional punctuation is surrounded immediately on both sides by word
854 // characters it also counts as a word character.
855 if (IsConditionalPunctuation(mDOMWordText[aIndex])) {
856 if (!aRecurse) {
857 // not allowed to look around, this punctuation counts like a separator
858 return CHAR_CLASS_SEPARATOR;
861 // check the left-hand character
862 if (aIndex == 0)
863 return CHAR_CLASS_SEPARATOR;
864 if (ClassifyCharacter(aIndex - 1, false) != CHAR_CLASS_WORD)
865 return CHAR_CLASS_SEPARATOR;
867 // now we know left char is a word-char, check the right-hand character
868 if (aIndex == PRInt32(mDOMWordText.Length()) - 1)
869 return CHAR_CLASS_SEPARATOR;
870 if (ClassifyCharacter(aIndex + 1, false) != CHAR_CLASS_WORD)
871 return CHAR_CLASS_SEPARATOR;
873 // char on either side is a word, this counts as a word
874 return CHAR_CLASS_WORD;
877 // all other punctuation
878 if (charCategory == nsIUGenCategory::kSeparator ||
879 charCategory == nsIUGenCategory::kOther ||
880 charCategory == nsIUGenCategory::kPunctuation ||
881 charCategory == nsIUGenCategory::kSymbol)
882 return CHAR_CLASS_SEPARATOR;
884 // any other character counts as a word
885 return CHAR_CLASS_WORD;
889 // WordSplitState::Advance
891 void
892 WordSplitState::Advance()
894 NS_ASSERTION(mDOMWordOffset >= 0, "Negative word index");
895 NS_ASSERTION(mDOMWordOffset < (PRInt32)mDOMWordText.Length(),
896 "Length beyond end");
898 mDOMWordOffset ++;
899 if (mDOMWordOffset >= (PRInt32)mDOMWordText.Length())
900 mCurCharClass = CHAR_CLASS_END_OF_INPUT;
901 else
902 mCurCharClass = ClassifyCharacter(mDOMWordOffset, PR_TRUE);
906 // WordSplitState::AdvanceThroughSeparators
908 void
909 WordSplitState::AdvanceThroughSeparators()
911 while (mCurCharClass == CHAR_CLASS_SEPARATOR)
912 Advance();
915 // WordSplitState::AdvanceThroughWord
917 void
918 WordSplitState::AdvanceThroughWord()
920 while (mCurCharClass == CHAR_CLASS_WORD)
921 Advance();
925 // WordSplitState::FindSpecialWord
927 PRInt32
928 WordSplitState::FindSpecialWord()
930 PRInt32 i;
932 // Search for email addresses. We simply define these as any sequence of
933 // characters with an '@' character in the middle. The DOM word is already
934 // split on whitepace, so we know that everything to the end is the address
936 // Also look for periods, this tells us if we want to run the URL finder.
937 PRBool foundDot = PR_FALSE;
938 PRInt32 firstColon = -1;
939 for (i = mDOMWordOffset;
940 i < PRInt32(mDOMWordText.Length()); i ++) {
941 if (mDOMWordText[i] == '@') {
942 // only accept this if there are unambiguous word characters (don't bother
943 // recursing to disambiguate apostrophes) on each side. This prevents
944 // classifying, e.g. "@home" as an email address
946 // Use this condition to only accept words with '@' in the middle of
947 // them. It works, but the inlinespellcker doesn't like this. The problem
948 // is that you type "fhsgfh@" that's a misspelled word followed by a
949 // symbol, but when you type another letter "fhsgfh@g" that first word
950 // need to be unmarked misspelled. It doesn't do this. it only checks the
951 // current position for potentially removing a spelling range.
952 if (i > 0 && ClassifyCharacter(i - 1, PR_FALSE) == CHAR_CLASS_WORD &&
953 i < (PRInt32)mDOMWordText.Length() - 1 &&
954 ClassifyCharacter(i + 1, PR_FALSE) == CHAR_CLASS_WORD)
956 return mDOMWordText.Length() - mDOMWordOffset;
957 } else if (mDOMWordText[i] == '.' && ! foundDot &&
958 i > 0 && i < (PRInt32)mDOMWordText.Length() - 1) {
959 // we found a period not at the end, we should check harder for URLs
960 foundDot = PR_TRUE;
961 } else if (mDOMWordText[i] == ':' && firstColon < 0) {
962 firstColon = i;
966 // If the first colon is followed by a slash, consider it a URL
967 // This will catch things like asdf://foo.com
968 if (firstColon >= 0 && firstColon < (PRInt32)mDOMWordText.Length() - 1 &&
969 mDOMWordText[firstColon + 1] == '/') {
970 return mDOMWordText.Length() - mDOMWordOffset;
973 // Check the text before the first colon against some known protocols. It
974 // is impossible to check against all protocols, especially since you can
975 // plug in new protocols. We also don't want to waste time here checking
976 // against a lot of obscure protocols.
977 if (firstColon > mDOMWordOffset) {
978 nsString protocol(Substring(mDOMWordText, mDOMWordOffset,
979 firstColon - mDOMWordOffset));
980 if (protocol.EqualsIgnoreCase("http") ||
981 protocol.EqualsIgnoreCase("https") ||
982 protocol.EqualsIgnoreCase("news") ||
983 protocol.EqualsIgnoreCase("ftp") ||
984 protocol.EqualsIgnoreCase("file") ||
985 protocol.EqualsIgnoreCase("javascript") ||
986 protocol.EqualsIgnoreCase("ftp")) {
987 return mDOMWordText.Length() - mDOMWordOffset;
991 // not anything special
992 return -1;
995 // WordSplitState::ShouldSkipWord
997 PRBool
998 WordSplitState::ShouldSkipWord(PRInt32 aStart, PRInt32 aLength)
1000 PRInt32 last = aStart + aLength;
1002 // check to see if the word contains a digit
1003 for (PRInt32 i = aStart; i < last; i ++) {
1004 PRUnichar ch = mDOMWordText[i];
1005 // XXX Shouldn't this be something a lot more complex, Unicode-based?
1006 if (ch >= '0' && ch <= '9')
1007 return PR_TRUE;
1010 // not special
1011 return PR_FALSE;
1014 // mozInlineSpellWordUtil::SplitDOMWord
1016 void
1017 mozInlineSpellWordUtil::SplitDOMWord(PRInt32 aStart, PRInt32 aEnd)
1019 WordSplitState state(this, mSoftText, aStart, aEnd - aStart);
1020 state.mCurCharClass = state.ClassifyCharacter(0, PR_TRUE);
1022 while (state.mCurCharClass != CHAR_CLASS_END_OF_INPUT) {
1023 state.AdvanceThroughSeparators();
1024 if (state.mCurCharClass == CHAR_CLASS_END_OF_INPUT)
1025 break;
1027 PRInt32 specialWordLength = state.FindSpecialWord();
1028 if (specialWordLength > 0) {
1029 mRealWords.AppendElement(
1030 RealWord(aStart + state.mDOMWordOffset, specialWordLength, PR_FALSE));
1032 // skip the special word
1033 state.mDOMWordOffset += specialWordLength;
1034 if (state.mDOMWordOffset + aStart >= aEnd)
1035 state.mCurCharClass = CHAR_CLASS_END_OF_INPUT;
1036 else
1037 state.mCurCharClass = state.ClassifyCharacter(state.mDOMWordOffset, PR_TRUE);
1038 continue;
1041 // save the beginning of the word
1042 PRInt32 wordOffset = state.mDOMWordOffset;
1044 // find the end of the word
1045 state.AdvanceThroughWord();
1046 PRInt32 wordLen = state.mDOMWordOffset - wordOffset;
1047 mRealWords.AppendElement(
1048 RealWord(aStart + wordOffset, wordLen,
1049 !state.ShouldSkipWord(wordOffset, wordLen)));