Bug 1945643 - Update to mozilla-nimbus-schemas 2025.1.1 r=chumphreys
[gecko.git] / dom / serializers / nsPlainTextSerializer.cpp
blob62a438e50840e62aeb0a99fd411bb967a0bf5b22
1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* vim: set ts=8 sts=2 et sw=2 tw=80: */
3 /* This Source Code Form is subject to the terms of the Mozilla Public
4 * License, v. 2.0. If a copy of the MPL was not distributed with this
5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
/*
 * nsIContentSerializer implementation that can be used with an
 * nsIDocumentEncoder to convert a DOM into plaintext in a nice way
 * (e.g. for copy/paste as plaintext).
 */
13 #include "nsPlainTextSerializer.h"
15 #include <limits>
17 #include "nsPrintfCString.h"
18 #include "nsDebug.h"
19 #include "nsGkAtoms.h"
20 #include "nsNameSpaceManager.h"
21 #include "nsTextFragment.h"
22 #include "nsContentUtils.h"
23 #include "nsReadableUtils.h"
24 #include "nsUnicharUtils.h"
25 #include "nsCRT.h"
26 #include "mozilla/Casting.h"
27 #include "mozilla/TextEditor.h"
28 #include "mozilla/dom/CharacterData.h"
29 #include "mozilla/dom/Element.h"
30 #include "mozilla/dom/HTMLBRElement.h"
31 #include "mozilla/dom/Text.h"
32 #include "mozilla/intl/Segmenter.h"
33 #include "mozilla/intl/UnicodeProperties.h"
34 #include "mozilla/dom/AbstractRange.h"
35 #include "nsUnicodeProperties.h"
36 #include "mozilla/Span.h"
37 #include "mozilla/Preferences.h"
38 #include "mozilla/StaticPrefs_converter.h"
39 #include "nsComputedDOMStyle.h"
// Forward declaration: this file only names Encoding through a pointer
// parameter of nsPlainTextSerializer::Init().
namespace mozilla {
class Encoding;
}  // namespace mozilla
45 using namespace mozilla;
46 using namespace mozilla::dom;
48 #define PREF_STRUCTS "converter.html2txt.structs"
49 #define PREF_HEADER_STRATEGY "converter.html2txt.header_strategy"
51 static const int32_t kTabSize = 4;
52 static const int32_t kIndentSizeHeaders =
53 2; /* Indention of h1, if
54 mHeaderStrategy = kIndentIncreasedWithHeaderLevel
55 or = kNumberHeadingsAndIndentSlightly. Indention of
56 other headers is derived from that. */
57 static const int32_t kIndentIncrementHeaders =
58 2; /* If mHeaderStrategy = kIndentIncreasedWithHeaderLevel,
59 indent h(x+1) this many
60 columns more than h(x) */
61 static const int32_t kIndentSizeList = kTabSize;
62 // Indention of non-first lines of ul and ol
63 static const int32_t kIndentSizeDD = kTabSize; // Indention of <dd>
64 static const char16_t kNBSP = 160;
65 static const char16_t kSPACE = ' ';
67 static int32_t HeaderLevel(const nsAtom* aTag);
68 static int32_t GetUnicharWidth(char32_t ucs);
69 static int32_t GetUnicharStringWidth(Span<const char16_t> aString);
71 // Someday may want to make this non-const:
72 static const uint32_t TagStackSize = 500;
// Cycle-collecting AddRef/Release implementations for the serializer.
74 NS_IMPL_CYCLE_COLLECTING_ADDREF(nsPlainTextSerializer)
75 NS_IMPL_CYCLE_COLLECTING_RELEASE(nsPlainTextSerializer)
// Interface map: the serializer answers to nsIContentSerializer and
// nsISupports.
77 NS_INTERFACE_MAP_BEGIN_CYCLE_COLLECTION(nsPlainTextSerializer)
78 NS_INTERFACE_MAP_ENTRY(nsIContentSerializer)
79 NS_INTERFACE_MAP_ENTRY(nsISupports)
80 NS_INTERFACE_MAP_END
// mElement is the only member listed for cycle collection.
82 NS_IMPL_CYCLE_COLLECTION(nsPlainTextSerializer, mElement)
84 nsresult NS_NewPlainTextSerializer(nsIContentSerializer** aSerializer) {
85 RefPtr<nsPlainTextSerializer> it = new nsPlainTextSerializer();
86 it.forget(aSerializer);
87 return NS_OK;
90 // @param aFlags As defined in nsIDocumentEncoder.idl.
91 static void DetermineLineBreak(const int32_t aFlags, nsAString& aLineBreak) {
92 // Set the line break character:
93 if ((aFlags & nsIDocumentEncoder::OutputCRLineBreak) &&
94 (aFlags & nsIDocumentEncoder::OutputLFLineBreak)) {
95 // Windows
96 aLineBreak.AssignLiteral(u"\r\n");
97 } else if (aFlags & nsIDocumentEncoder::OutputCRLineBreak) {
98 // Mac
99 aLineBreak.AssignLiteral(u"\r");
100 } else if (aFlags & nsIDocumentEncoder::OutputLFLineBreak) {
101 // Unix/DOM
102 aLineBreak.AssignLiteral(u"\n");
103 } else {
104 // Platform/default
105 aLineBreak.AssignLiteral(NS_ULINEBREAK);
109 void nsPlainTextSerializer::CurrentLine::MaybeReplaceNbspsInContent(
110 const int32_t aFlags) {
111 if (!(aFlags & nsIDocumentEncoder::OutputPersistNBSP)) {
112 // First, replace all nbsp characters with spaces,
113 // which the unicode encoder won't do for us.
114 mContent.ReplaceChar(kNBSP, kSPACE);
118 void nsPlainTextSerializer::CurrentLine::ResetContentAndIndentationHeader() {
119 mContent.Truncate();
120 mIndentation.mHeader.Truncate();
123 int32_t nsPlainTextSerializer::CurrentLine::FindWrapIndexForContent(
124 const uint32_t aWrapColumn, bool aUseLineBreaker) const {
125 MOZ_ASSERT(!mContent.IsEmpty());
127 const uint32_t prefixwidth = DeterminePrefixWidth();
128 int32_t goodSpace = 0;
130 if (aUseLineBreaker) {
131 // We advance one line break point at a time from the beginning of the
132 // mContent until we find a width less than or equal to wrap column.
133 uint32_t width = 0;
134 intl::LineBreakIteratorUtf16 lineBreakIter(mContent);
135 while (Maybe<uint32_t> nextGoodSpace = lineBreakIter.Next()) {
136 // Trim space at the tail. UAX#14 doesn't have break opportunity for
137 // ASCII space at the tail.
138 const Maybe<uint32_t> originalNextGoodSpace = nextGoodSpace;
139 while (*nextGoodSpace > 0 &&
140 mContent.CharAt(*nextGoodSpace - 1) == 0x20) {
141 nextGoodSpace = Some(*nextGoodSpace - 1);
143 if (*nextGoodSpace == 0) {
144 // Restore the original nextGoodSpace.
145 nextGoodSpace = originalNextGoodSpace;
148 width += GetUnicharStringWidth(Span<const char16_t>(
149 mContent.get() + goodSpace, *nextGoodSpace - goodSpace));
150 if (prefixwidth + width > aWrapColumn) {
151 // The next break point makes the width exceeding the wrap column, so
152 // goodSpace is what we want.
153 break;
155 goodSpace = AssertedCast<int32_t>(*nextGoodSpace);
158 return goodSpace;
161 // In this case we don't want strings, especially CJK-ones, to be split. See
162 // bug 333064 for more information. We break only at ASCII spaces.
163 if (aWrapColumn >= prefixwidth) {
164 // Search backward from the adjusted wrap column or from the text end.
165 goodSpace =
166 std::min<int32_t>(aWrapColumn - prefixwidth, mContent.Length() - 1);
167 while (goodSpace >= 0) {
168 if (nsCRT::IsAsciiSpace(mContent.CharAt(goodSpace))) {
169 return goodSpace;
171 goodSpace--;
175 // Search forward from the adjusted wrap column.
176 goodSpace = (prefixwidth > aWrapColumn) ? 1 : aWrapColumn - prefixwidth;
177 const int32_t contentLength = mContent.Length();
178 while (goodSpace < contentLength &&
179 !nsCRT::IsAsciiSpace(mContent.CharAt(goodSpace))) {
180 goodSpace++;
183 return goodSpace;
186 nsPlainTextSerializer::OutputManager::OutputManager(const int32_t aFlags,
187 nsAString& aOutput)
188 : mFlags{aFlags}, mOutput{aOutput}, mAtFirstColumn{true} {
189 MOZ_ASSERT(aOutput.IsEmpty());
191 DetermineLineBreak(mFlags, mLineBreak);
194 void nsPlainTextSerializer::OutputManager::Append(
195 const CurrentLine& aCurrentLine,
196 const StripTrailingWhitespaces aStripTrailingWhitespaces) {
197 if (IsAtFirstColumn()) {
198 nsAutoString quotesAndIndent;
199 aCurrentLine.CreateQuotesAndIndent(quotesAndIndent);
201 if ((aStripTrailingWhitespaces == StripTrailingWhitespaces::kMaybe)) {
202 const bool stripTrailingSpaces = aCurrentLine.mContent.IsEmpty();
203 if (stripTrailingSpaces) {
204 quotesAndIndent.Trim(" ", false, true, false);
208 Append(quotesAndIndent);
211 Append(aCurrentLine.mContent);
214 void nsPlainTextSerializer::OutputManager::Append(const nsAString& aString) {
215 if (!aString.IsEmpty()) {
216 mOutput.Append(aString);
217 mAtFirstColumn = false;
221 void nsPlainTextSerializer::OutputManager::AppendLineBreak() {
222 mOutput.Append(mLineBreak);
223 mAtFirstColumn = true;
226 uint32_t nsPlainTextSerializer::OutputManager::GetOutputLength() const {
227 return mOutput.Length();
230 nsPlainTextSerializer::nsPlainTextSerializer()
231 : mFloatingLines(-1),
232 mLineBreakDue(false),
233 kSpace(u" "_ns) // Init of "constant"
235 mHeadLevel = 0;
236 mHasWrittenCiteBlockquote = false;
237 mSpanLevel = 0;
238 for (int32_t i = 0; i <= 6; i++) {
239 mHeaderCounter[i] = 0;
242 // Flow
243 mEmptyLines = 1; // The start of the document is an "empty line" in itself,
244 mInWhitespace = false;
245 mPreFormattedMail = false;
247 mPreformattedBlockBoundary = false;
249 // initialize the tag stack to zero:
250 // The stack only ever contains pointers to static atoms, so they don't
251 // need refcounting.
252 mTagStack = new const nsAtom*[TagStackSize];
253 mTagStackIndex = 0;
254 mIgnoreAboveIndex = (uint32_t)kNotFound;
256 mULCount = 0;
258 mIgnoredChildNodeLevel = 0;
261 nsPlainTextSerializer::~nsPlainTextSerializer() {
262 delete[] mTagStack;
263 NS_WARNING_ASSERTION(mHeadLevel == 0, "Wrong head level!");
266 nsPlainTextSerializer::Settings::HeaderStrategy
267 nsPlainTextSerializer::Settings::Convert(const int32_t aPrefHeaderStrategy) {
268 HeaderStrategy result{HeaderStrategy::kIndentIncreasedWithHeaderLevel};
270 switch (aPrefHeaderStrategy) {
271 case 0: {
272 result = HeaderStrategy::kNoIndentation;
273 break;
275 case 1: {
276 result = HeaderStrategy::kIndentIncreasedWithHeaderLevel;
277 break;
279 case 2: {
280 result = HeaderStrategy::kNumberHeadingsAndIndentSlightly;
281 break;
283 default: {
284 NS_WARNING(
285 nsPrintfCString("Header strategy pref contains undefined value: %i",
286 aPrefHeaderStrategy)
287 .get());
291 return result;
294 const int32_t kDefaultHeaderStrategy = 1;
296 void nsPlainTextSerializer::Settings::Init(const int32_t aFlags,
297 const uint32_t aWrapColumn) {
298 mFlags = aFlags;
300 if (mFlags & nsIDocumentEncoder::OutputFormatted) {
301 // Get some prefs that controls how we do formatted output
302 mStructs = Preferences::GetBool(PREF_STRUCTS, mStructs);
304 int32_t headerStrategy =
305 Preferences::GetInt(PREF_HEADER_STRATEGY, kDefaultHeaderStrategy);
306 mHeaderStrategy = Convert(headerStrategy);
309 mWithRubyAnnotation = StaticPrefs::converter_html2txt_always_include_ruby() ||
310 (mFlags & nsIDocumentEncoder::OutputRubyAnnotation);
312 // XXX We should let the caller decide whether to do this or not
313 mFlags &= ~nsIDocumentEncoder::OutputNoFramesContent;
315 mWrapColumn = aWrapColumn;
318 NS_IMETHODIMP
319 nsPlainTextSerializer::Init(const uint32_t aFlags, uint32_t aWrapColumn,
320 const Encoding* aEncoding, bool aIsCopying,
321 bool aIsWholeDocument,
322 bool* aNeedsPreformatScanning, nsAString& aOutput) {
323 #ifdef DEBUG
324 // Check if the major control flags are set correctly.
325 if (aFlags & nsIDocumentEncoder::OutputFormatFlowed) {
326 // One of OutputFormatted or OutputWrap must be set, but not both.
327 NS_ASSERTION((aFlags & nsIDocumentEncoder::OutputFormatted) !=
328 (aFlags & nsIDocumentEncoder::OutputWrap),
329 "If you want format=flowed, you must combine it "
330 "with either nsIDocumentEncoder::OutputFormatted "
331 "or nsIDocumentEncoder::OutputWrap");
334 if (aFlags & nsIDocumentEncoder::OutputFormatted) {
335 NS_ASSERTION(
336 !(aFlags & nsIDocumentEncoder::OutputPreformatted),
337 "Can't do formatted and preformatted output at the same time!");
339 #endif
340 MOZ_ASSERT(!(aFlags & nsIDocumentEncoder::OutputFormatDelSp) ||
341 (aFlags & nsIDocumentEncoder::OutputFormatFlowed));
343 *aNeedsPreformatScanning = true;
344 mSettings.Init(aFlags, aWrapColumn);
345 mOutputManager.emplace(mSettings.GetFlags(), aOutput);
347 mUseLineBreaker = mSettings.MayWrap() && mSettings.MayBreakLines();
349 mLineBreakDue = false;
350 mFloatingLines = -1;
352 mPreformattedBlockBoundary = false;
354 MOZ_ASSERT(mOLStack.IsEmpty());
356 return NS_OK;
359 bool nsPlainTextSerializer::GetLastBool(const nsTArray<bool>& aStack) {
360 uint32_t size = aStack.Length();
361 if (size == 0) {
362 return false;
364 return aStack.ElementAt(size - 1);
367 void nsPlainTextSerializer::SetLastBool(nsTArray<bool>& aStack, bool aValue) {
368 uint32_t size = aStack.Length();
369 if (size > 0) {
370 aStack.ElementAt(size - 1) = aValue;
371 } else {
372 NS_ERROR("There is no \"Last\" value");
376 void nsPlainTextSerializer::PushBool(nsTArray<bool>& aStack, bool aValue) {
377 aStack.AppendElement(bool(aValue));
380 bool nsPlainTextSerializer::PopBool(nsTArray<bool>& aStack) {
381 return aStack.Length() ? aStack.PopLastElement() : false;
384 bool nsPlainTextSerializer::IsIgnorableRubyAnnotation(
385 const nsAtom* aTag) const {
386 if (mSettings.GetWithRubyAnnotation()) {
387 return false;
390 return aTag == nsGkAtoms::rp || aTag == nsGkAtoms::rt ||
391 aTag == nsGkAtoms::rtc;
394 // Return true if aElement has 'display:none' or if we just don't know.
395 static bool IsDisplayNone(Element* aElement) {
396 RefPtr<const ComputedStyle> computedStyle =
397 nsComputedDOMStyle::GetComputedStyleNoFlush(aElement);
398 return !computedStyle ||
399 computedStyle->StyleDisplay()->mDisplay == StyleDisplay::None;
402 static bool IsIgnorableScriptOrStyle(Element* aElement) {
403 return aElement->IsAnyOfHTMLElements(nsGkAtoms::script, nsGkAtoms::style) &&
404 IsDisplayNone(aElement);
407 NS_IMETHODIMP
408 nsPlainTextSerializer::AppendText(nsIContent* aText, int32_t aStartOffset,
409 int32_t aEndOffset) {
410 if (mIgnoreAboveIndex != (uint32_t)kNotFound) {
411 return NS_OK;
414 NS_ASSERTION(aStartOffset >= 0, "Negative start offset for text fragment!");
415 if (aStartOffset < 0) return NS_ERROR_INVALID_ARG;
417 NS_ENSURE_ARG(aText);
419 nsresult rv = NS_OK;
421 nsIContent* content = aText;
422 const nsTextFragment* frag;
423 if (!content || !(frag = content->GetText())) {
424 return NS_ERROR_FAILURE;
427 int32_t fragLength = frag->GetLength();
428 int32_t endoffset =
429 (aEndOffset == -1) ? fragLength : std::min(aEndOffset, fragLength);
430 NS_ASSERTION(aStartOffset <= endoffset,
431 "A start offset is beyond the end of the text fragment!");
433 int32_t length = endoffset - aStartOffset;
434 if (length <= 0) {
435 return NS_OK;
438 nsAutoString textstr;
439 if (frag->Is2b()) {
440 textstr.Assign(frag->Get2b() + aStartOffset, length);
441 } else {
442 // AssignASCII is for 7-bit character only, so don't use it
443 const char* data = frag->Get1b();
444 CopyASCIItoUTF16(Substring(data + aStartOffset, data + endoffset), textstr);
447 // Mask the text if the text node is in a password field.
448 if (content->HasFlag(NS_MAYBE_MASKED)) {
449 TextEditor::MaskString(textstr, *content->AsText(), 0, aStartOffset);
452 // We have to split the string across newlines
453 // to match parser behavior
454 int32_t start = 0;
455 int32_t offset = textstr.FindCharInSet(u"\n\r");
456 while (offset != kNotFound) {
457 if (offset > start) {
458 // Pass in the line
459 DoAddText(false, Substring(textstr, start, offset - start));
462 // Pass in a newline
463 DoAddText();
465 start = offset + 1;
466 offset = textstr.FindCharInSet(u"\n\r", start);
469 // Consume the last bit of the string if there's any left
470 if (start < length) {
471 if (start) {
472 DoAddText(false, Substring(textstr, start, length - start));
473 } else {
474 DoAddText(false, textstr);
478 return rv;
481 NS_IMETHODIMP
482 nsPlainTextSerializer::AppendCDATASection(nsIContent* aCDATASection,
483 int32_t aStartOffset,
484 int32_t aEndOffset) {
485 return AppendText(aCDATASection, aStartOffset, aEndOffset);
488 NS_IMETHODIMP
489 nsPlainTextSerializer::ScanElementForPreformat(Element* aElement) {
490 mPreformatStack.push(IsElementPreformatted(aElement));
491 return NS_OK;
494 NS_IMETHODIMP
495 nsPlainTextSerializer::ForgetElementForPreformat(Element* aElement) {
496 MOZ_RELEASE_ASSERT(!mPreformatStack.empty(),
497 "Tried to pop without previous push.");
498 mPreformatStack.pop();
499 return NS_OK;
502 NS_IMETHODIMP
503 nsPlainTextSerializer::AppendElementStart(Element* aElement,
504 Element* aOriginalElement) {
505 NS_ENSURE_ARG(aElement);
507 mElement = aElement;
509 nsresult rv;
510 nsAtom* id = GetIdForContent(mElement);
512 bool isContainer = !FragmentOrElement::IsHTMLVoid(id);
514 if (isContainer) {
515 rv = DoOpenContainer(id);
516 } else {
517 rv = DoAddLeaf(id);
520 mElement = nullptr;
522 if (id == nsGkAtoms::head) {
523 ++mHeadLevel;
526 return rv;
529 NS_IMETHODIMP
530 nsPlainTextSerializer::AppendElementEnd(Element* aElement,
531 Element* aOriginalElement) {
532 NS_ENSURE_ARG(aElement);
534 mElement = aElement;
536 nsresult rv;
537 nsAtom* id = GetIdForContent(mElement);
539 bool isContainer = !FragmentOrElement::IsHTMLVoid(id);
541 rv = NS_OK;
542 if (isContainer) {
543 rv = DoCloseContainer(id);
546 mElement = nullptr;
548 if (id == nsGkAtoms::head) {
549 NS_ASSERTION(mHeadLevel != 0, "mHeadLevel being decremented below 0");
550 --mHeadLevel;
553 return rv;
556 NS_IMETHODIMP
557 nsPlainTextSerializer::FlushAndFinish() {
558 MOZ_ASSERT(mOutputManager);
560 mOutputManager->Flush(mCurrentLine);
561 return Finish();
564 NS_IMETHODIMP
565 nsPlainTextSerializer::Finish() {
566 mOutputManager.reset();
568 return NS_OK;
571 NS_IMETHODIMP
572 nsPlainTextSerializer::GetOutputLength(uint32_t& aLength) const {
573 MOZ_ASSERT(mOutputManager);
575 aLength = mOutputManager->GetOutputLength();
577 return NS_OK;
580 NS_IMETHODIMP
581 nsPlainTextSerializer::AppendDocumentStart(Document* aDocument) {
582 return NS_OK;
// Placeholder pushed on mOLStack for an <ol> when the output is not
// formatted.
585 constexpr int32_t kOlStackDummyValue = 0;
// Handles the start tag of a non-void element (called from
// AppendElementStart with mElement set to the element). In order, it:
//  - counts ignorable ruby annotations and display:none script/style
//    elements in mIgnoredChildNodeLevel and stops;
//  - schedules a pending line break after a preformatted block when
//    OutputForPlainTextClipboardCopy is set;
//  - does nothing more for OutputRaw;
//  - pushes aTag on the tag stack;
//  - starts ignoring content for noscript / iframe / noframes depending on
//    the flags;
//  - detects preformatted mail from the <body> style attribute;
//  - emits the vertical space, indentation and list/quote markers that
//    belong to aTag.
// Every path returns NS_OK.
587 nsresult nsPlainTextSerializer::DoOpenContainer(const nsAtom* aTag) {
588 if (IsIgnorableRubyAnnotation(aTag)) {
589 // Ignorable ruby annotation shouldn't be replaced by a placeholder
590 // character, neither any of its descendants.
591 mIgnoredChildNodeLevel++;
592 return NS_OK;
594 if (IsIgnorableScriptOrStyle(mElement)) {
595 mIgnoredChildNodeLevel++;
596 return NS_OK;
599 if (mSettings.HasFlag(nsIDocumentEncoder::OutputForPlainTextClipboardCopy)) {
// mPreformattedBlockBoundary is set by DoCloseContainer when a
// preformatted block element was closed.
600 if (mPreformattedBlockBoundary && DoOutput()) {
601 // Should always end a line, but get no more whitespace
602 if (mFloatingLines < 0) mFloatingLines = 0;
603 mLineBreakDue = true;
605 mPreformattedBlockBoundary = false;
608 if (mSettings.HasFlag(nsIDocumentEncoder::OutputRaw)) {
609 // Raw means raw. Don't even think about doing anything fancy
610 // here like indenting, adding line breaks or any other
611 // characters such as list item bullets, quote characters
612 // around <q>, etc.
614 return NS_OK;
// Tags beyond TagStackSize are silently not recorded.
617 if (mTagStackIndex < TagStackSize) {
618 mTagStack[mTagStackIndex++] = aTag;
// Already inside an ignored subtree: nothing more to do for this tag.
621 if (mIgnoreAboveIndex != (uint32_t)kNotFound) {
622 return NS_OK;
625 // Reset this so that <blockquote type=cite> doesn't affect the whitespace
626 // above random <pre>s below it.
627 mHasWrittenCiteBlockquote =
628 mHasWrittenCiteBlockquote && aTag == nsGkAtoms::pre;
630 bool isInCiteBlockquote = false;
632 // XXX special-case <blockquote type=cite> so that we don't add additional
633 // newlines before the text.
634 if (aTag == nsGkAtoms::blockquote) {
635 nsAutoString value;
636 nsresult rv = GetAttributeValue(nsGkAtoms::type, value);
637 isInCiteBlockquote = NS_SUCCEEDED(rv) && value.EqualsIgnoreCase("cite");
// Emit the line break left pending by an earlier close tag, except in front
// of a cite blockquote.
640 if (mLineBreakDue && !isInCiteBlockquote) EnsureVerticalSpace(mFloatingLines);
642 // Check if this tag's content that should not be output
643 if ((aTag == nsGkAtoms::noscript &&
644 !mSettings.HasFlag(nsIDocumentEncoder::OutputNoScriptContent)) ||
645 ((aTag == nsGkAtoms::iframe || aTag == nsGkAtoms::noframes) &&
646 !mSettings.HasFlag(nsIDocumentEncoder::OutputNoFramesContent))) {
647 // Ignore everything that follows the current tag in
648 // question until a matching end tag is encountered.
649 mIgnoreAboveIndex = mTagStackIndex - 1;
650 return NS_OK;
653 if (aTag == nsGkAtoms::body) {
654 // Try to figure out here whether we have a
655 // preformatted style attribute set by Thunderbird.
657 // Trigger on the presence of a "pre-wrap" in the
658 // style attribute. That's a very simplistic way to do
659 // it, but better than nothing.
660 nsAutoString style;
661 int32_t whitespace;
662 if (NS_SUCCEEDED(GetAttributeValue(nsGkAtoms::style, style)) &&
663 (kNotFound != (whitespace = style.Find(u"white-space:")))) {
// Both branches set the same flag; they differ only in the debug output.
664 if (kNotFound != style.LowerCaseFindASCII("pre-wrap", whitespace)) {
665 #ifdef DEBUG_preformatted
666 printf("Set mPreFormattedMail based on style pre-wrap\n");
667 #endif
668 mPreFormattedMail = true;
669 } else if (kNotFound != style.LowerCaseFindASCII("pre", whitespace)) {
670 #ifdef DEBUG_preformatted
671 printf("Set mPreFormattedMail based on style pre\n");
672 #endif
673 mPreFormattedMail = true;
675 } else {
676 /* See comment at end of function. */
677 mInWhitespace = true;
678 mPreFormattedMail = false;
681 return NS_OK;
684 // Keep this in sync with DoCloseContainer!
685 if (!DoOutput()) {
686 return NS_OK;
// Per-tag handling: vertical space, indentation and markers.
689 if (aTag == nsGkAtoms::p)
690 EnsureVerticalSpace(1);
691 else if (aTag == nsGkAtoms::pre) {
692 if (GetLastBool(mIsInCiteBlockquote))
693 EnsureVerticalSpace(0);
694 else if (mHasWrittenCiteBlockquote) {
695 EnsureVerticalSpace(0);
696 mHasWrittenCiteBlockquote = false;
697 } else
698 EnsureVerticalSpace(1);
699 } else if (aTag == nsGkAtoms::tr) {
700 PushBool(mHasWrittenCellsForRow, false);
701 } else if (aTag == nsGkAtoms::td || aTag == nsGkAtoms::th) {
702 // We must make sure that the content of two table cells get a
703 // space between them.
705 // To make the separation between cells most obvious and
706 // importable, we use a TAB.
707 if (mHasWrittenCellsForRow.IsEmpty()) {
708 // We don't always see a <tr> (nor a <table>) before the <td> if we're
709 // copying part of a table
710 PushBool(mHasWrittenCellsForRow, true); // will never be popped
711 } else if (GetLastBool(mHasWrittenCellsForRow)) {
712 // Bypass |Write| so that the TAB isn't compressed away.
713 AddToLine(u"\t", 1);
714 mInWhitespace = true;
715 } else {
// First cell of this row: remember that a cell has been written.
716 SetLastBool(mHasWrittenCellsForRow, true);
718 } else if (aTag == nsGkAtoms::ul) {
719 // Indent here to support nested lists, which aren't included in li :-(
720 EnsureVerticalSpace(IsInOlOrUl() ? 0 : 1);
721 // Must end the current line before we change indention
722 mCurrentLine.mIndentation.mLength += kIndentSizeList;
723 mULCount++;
724 } else if (aTag == nsGkAtoms::ol) {
725 EnsureVerticalSpace(IsInOlOrUl() ? 0 : 1);
726 if (mSettings.HasFlag(nsIDocumentEncoder::OutputFormatted)) {
727 // Must end the current line before we change indention
// The list starts counting at the "start" attribute when it parses as an
// integer, and at 1 otherwise.
728 nsAutoString startAttr;
729 int32_t startVal = 1;
730 if (NS_SUCCEEDED(GetAttributeValue(nsGkAtoms::start, startAttr))) {
731 nsresult rv = NS_OK;
732 startVal = startAttr.ToInteger(&rv);
733 if (NS_FAILED(rv)) {
734 startVal = 1;
737 mOLStack.AppendElement(startVal);
738 } else {
739 mOLStack.AppendElement(kOlStackDummyValue);
741 mCurrentLine.mIndentation.mLength += kIndentSizeList; // see ul
742 } else if (aTag == nsGkAtoms::li &&
743 mSettings.HasFlag(nsIDocumentEncoder::OutputFormatted)) {
744 if (mTagStackIndex > 1 && IsInOL()) {
745 if (!mOLStack.IsEmpty()) {
// A "value" attribute that parses as an integer overrides the running
// counter for this item.
746 nsAutoString valueAttr;
747 if (NS_SUCCEEDED(GetAttributeValue(nsGkAtoms::value, valueAttr))) {
748 nsresult rv = NS_OK;
749 int32_t valueAttrVal = valueAttr.ToInteger(&rv);
750 if (NS_SUCCEEDED(rv)) {
751 mOLStack.LastElement() = valueAttrVal;
754 // This is what nsBulletFrame does for OLs:
755 mCurrentLine.mIndentation.mHeader.AppendInt(mOLStack.LastElement(), 10);
756 mOLStack.LastElement()++;
757 } else {
758 mCurrentLine.mIndentation.mHeader.Append(char16_t('#'));
761 mCurrentLine.mIndentation.mHeader.Append(char16_t('.'));
763 } else {
// Unordered item: the bullet character cycles through "*o+#" with the <ul>
// nesting depth; '#' is used when no <ul> is open.
764 static const char bulletCharArray[] = "*o+#";
765 uint32_t index = mULCount > 0 ? (mULCount - 1) : 3;
766 char bulletChar = bulletCharArray[index % 4];
767 mCurrentLine.mIndentation.mHeader.Append(char16_t(bulletChar));
770 mCurrentLine.mIndentation.mHeader.Append(char16_t(' '));
771 } else if (aTag == nsGkAtoms::dl) {
772 EnsureVerticalSpace(1);
773 } else if (aTag == nsGkAtoms::dt) {
774 EnsureVerticalSpace(0);
775 } else if (aTag == nsGkAtoms::dd) {
776 EnsureVerticalSpace(0);
777 mCurrentLine.mIndentation.mLength += kIndentSizeDD;
778 } else if (aTag == nsGkAtoms::span) {
779 ++mSpanLevel;
780 } else if (aTag == nsGkAtoms::blockquote) {
781 // Push
782 PushBool(mIsInCiteBlockquote, isInCiteBlockquote);
783 if (isInCiteBlockquote) {
784 EnsureVerticalSpace(0);
785 mCurrentLine.mCiteQuoteLevel++;
786 } else {
787 EnsureVerticalSpace(1);
788 mCurrentLine.mIndentation.mLength +=
789 kTabSize; // Check for some maximum value?
791 } else if (aTag == nsGkAtoms::q) {
792 Write(u"\""_ns);
795 // Else make sure we'll separate block level tags,
796 // even if we're about to leave, before doing any other formatting.
797 else if (IsCssBlockLevelElement(mElement)) {
798 EnsureVerticalSpace(0);
// Headings and inline structure markers are only produced for formatted
// output.
801 if (mSettings.HasFlag(nsIDocumentEncoder::OutputFormatted)) {
802 OpenContainerForOutputFormatted(aTag);
804 return NS_OK;
807 void nsPlainTextSerializer::OpenContainerForOutputFormatted(
808 const nsAtom* aTag) {
809 const bool currentNodeIsConverted = IsCurrentNodeConverted();
811 if (aTag == nsGkAtoms::h1 || aTag == nsGkAtoms::h2 || aTag == nsGkAtoms::h3 ||
812 aTag == nsGkAtoms::h4 || aTag == nsGkAtoms::h5 || aTag == nsGkAtoms::h6) {
813 EnsureVerticalSpace(2);
814 if (mSettings.GetHeaderStrategy() ==
815 Settings::HeaderStrategy::kNumberHeadingsAndIndentSlightly) {
816 mCurrentLine.mIndentation.mLength += kIndentSizeHeaders;
817 // Caching
818 int32_t level = HeaderLevel(aTag);
819 // Increase counter for current level
820 mHeaderCounter[level]++;
821 // Reset all lower levels
822 int32_t i;
824 for (i = level + 1; i <= 6; i++) {
825 mHeaderCounter[i] = 0;
828 // Construct numbers
829 nsAutoString leadup;
830 for (i = 1; i <= level; i++) {
831 leadup.AppendInt(mHeaderCounter[i]);
832 leadup.Append(char16_t('.'));
834 leadup.Append(char16_t(' '));
835 Write(leadup);
836 } else if (mSettings.GetHeaderStrategy() ==
837 Settings::HeaderStrategy::kIndentIncreasedWithHeaderLevel) {
838 mCurrentLine.mIndentation.mLength += kIndentSizeHeaders;
839 for (int32_t i = HeaderLevel(aTag); i > 1; i--) {
840 // for h(x), run x-1 times
841 mCurrentLine.mIndentation.mLength += kIndentIncrementHeaders;
844 } else if (aTag == nsGkAtoms::sup && mSettings.GetStructs() &&
845 !currentNodeIsConverted) {
846 Write(u"^"_ns);
847 } else if (aTag == nsGkAtoms::sub && mSettings.GetStructs() &&
848 !currentNodeIsConverted) {
849 Write(u"_"_ns);
850 } else if (aTag == nsGkAtoms::code && mSettings.GetStructs() &&
851 !currentNodeIsConverted) {
852 Write(u"|"_ns);
853 } else if ((aTag == nsGkAtoms::strong || aTag == nsGkAtoms::b) &&
854 mSettings.GetStructs() && !currentNodeIsConverted) {
855 Write(u"*"_ns);
856 } else if ((aTag == nsGkAtoms::em || aTag == nsGkAtoms::i) &&
857 mSettings.GetStructs() && !currentNodeIsConverted) {
858 Write(u"/"_ns);
859 } else if (aTag == nsGkAtoms::u && mSettings.GetStructs() &&
860 !currentNodeIsConverted) {
861 Write(u"_"_ns);
864 /* Container elements are always block elements, so we shouldn't
865 output any whitespace immediately after the container tag even if
866 there's extra whitespace there because the HTML is pretty-printed
867 or something. To ensure that happens, tell the serializer we're
868 already in whitespace so it won't output more. */
869 mInWhitespace = true;
// Handles the end tag of a non-void element (called from AppendElementEnd
// with mElement set to the element). It undoes what DoOpenContainer did for
// the same tag: it decrements mIgnoredChildNodeLevel for ignorable
// elements, pops the tag stack, ends an ignored subtree, restores
// indentation and quote levels, and records the line break / floating lines
// that are due before the next content. Every path returns NS_OK.
872 nsresult nsPlainTextSerializer::DoCloseContainer(const nsAtom* aTag) {
873 if (IsIgnorableRubyAnnotation(aTag)) {
874 mIgnoredChildNodeLevel--;
875 return NS_OK;
877 if (IsIgnorableScriptOrStyle(mElement)) {
878 mIgnoredChildNodeLevel--;
879 return NS_OK;
882 if (mSettings.HasFlag(nsIDocumentEncoder::OutputForPlainTextClipboardCopy)) {
883 if (DoOutput() && IsElementPreformatted() &&
884 IsCssBlockLevelElement(mElement)) {
885 // If we're closing a preformatted block element, output a line break
886 // when we find a new container.
887 mPreformattedBlockBoundary = true;
891 if (mSettings.HasFlag(nsIDocumentEncoder::OutputRaw)) {
892 // Raw means raw. Don't even think about doing anything fancy
893 // here like indenting, adding line breaks or any other
894 // characters such as list item bullets, quote characters
895 // around <q>, etc.
897 return NS_OK;
// Pop the tag pushed by DoOpenContainer.
900 if (mTagStackIndex > 0) {
901 --mTagStackIndex;
// Still inside (or just leaving) an ignored subtree. When mIgnoreAboveIndex
// is kNotFound cast to uint32_t this comparison is not expected to hold
// (NOTE(review): relies on kNotFound being -1 — confirm).
904 if (mTagStackIndex >= mIgnoreAboveIndex) {
905 if (mTagStackIndex == mIgnoreAboveIndex) {
906 // We're dealing with the close tag whose matching
907 // open tag had set the mIgnoreAboveIndex value.
908 // Reset mIgnoreAboveIndex before discarding this tag.
909 mIgnoreAboveIndex = (uint32_t)kNotFound;
911 return NS_OK;
914 MOZ_ASSERT(mOutputManager);
916 // End current line if we're ending a block level tag
917 if ((aTag == nsGkAtoms::body) || (aTag == nsGkAtoms::html)) {
918 // We want the output to end with a new line,
919 // but in preformatted areas like text fields,
920 // we can't emit newlines that weren't there.
921 // So add the newline only in the case of formatted output.
922 if (mSettings.HasFlag(nsIDocumentEncoder::OutputFormatted)) {
923 EnsureVerticalSpace(0);
924 } else {
925 mOutputManager->Flush(mCurrentLine);
927 // We won't want to do anything with these in formatted mode either,
928 // so just return now:
929 return NS_OK;
932 // Keep this in sync with DoOpenContainer!
933 if (!DoOutput()) {
934 return NS_OK;
// Per-tag handling. Most branches only record the pending line break
// (mLineBreakDue) and the number of floating lines; DoOpenContainer passes
// them to EnsureVerticalSpace when the next container opens.
937 if (aTag == nsGkAtoms::tr) {
938 PopBool(mHasWrittenCellsForRow);
939 // Should always end a line, but get no more whitespace
940 if (mFloatingLines < 0) mFloatingLines = 0;
941 mLineBreakDue = true;
942 } else if (((aTag == nsGkAtoms::li) || (aTag == nsGkAtoms::dt)) &&
943 mSettings.HasFlag(nsIDocumentEncoder::OutputFormatted)) {
944 // Items that should always end a line, but get no more whitespace
945 if (mFloatingLines < 0) mFloatingLines = 0;
946 mLineBreakDue = true;
947 } else if (aTag == nsGkAtoms::pre) {
948 mFloatingLines = GetLastBool(mIsInCiteBlockquote) ? 0 : 1;
949 mLineBreakDue = true;
950 } else if (aTag == nsGkAtoms::ul) {
// Flush before the indentation changes so the current line keeps the list
// indentation.
951 mOutputManager->Flush(mCurrentLine);
952 mCurrentLine.mIndentation.mLength -= kIndentSizeList;
953 --mULCount;
954 if (!IsInOlOrUl()) {
955 mFloatingLines = 1;
956 mLineBreakDue = true;
958 } else if (aTag == nsGkAtoms::ol) {
959 mOutputManager->Flush(mCurrentLine); // Doing this after decreasing
960 // OLStackIndex would be wrong.
961 mCurrentLine.mIndentation.mLength -= kIndentSizeList;
962 MOZ_ASSERT(!mOLStack.IsEmpty(), "Wrong OLStack level!");
963 mOLStack.RemoveLastElement();
964 if (!IsInOlOrUl()) {
965 mFloatingLines = 1;
966 mLineBreakDue = true;
968 } else if (aTag == nsGkAtoms::dl) {
969 mFloatingLines = 1;
970 mLineBreakDue = true;
971 } else if (aTag == nsGkAtoms::dd) {
972 mOutputManager->Flush(mCurrentLine);
973 mCurrentLine.mIndentation.mLength -= kIndentSizeDD;
974 } else if (aTag == nsGkAtoms::span) {
975 NS_ASSERTION(mSpanLevel, "Span level will be negative!");
976 --mSpanLevel;
977 } else if (aTag == nsGkAtoms::div) {
978 if (mFloatingLines < 0) mFloatingLines = 0;
979 mLineBreakDue = true;
980 } else if (aTag == nsGkAtoms::blockquote) {
981 mOutputManager->Flush(mCurrentLine); // Is this needed?
983 // Pop
984 bool isInCiteBlockquote = PopBool(mIsInCiteBlockquote);
986 if (isInCiteBlockquote) {
987 NS_ASSERTION(mCurrentLine.mCiteQuoteLevel,
988 "CiteQuote level will be negative!");
989 mCurrentLine.mCiteQuoteLevel--;
990 mFloatingLines = 0;
991 mHasWrittenCiteBlockquote = true;
992 } else {
993 mCurrentLine.mIndentation.mLength -= kTabSize;
994 mFloatingLines = 1;
996 mLineBreakDue = true;
997 } else if (aTag == nsGkAtoms::q) {
998 Write(u"\""_ns);
999 } else if (IsCssBlockLevelElement(mElement)) {
1000 // All other blocks get 1 vertical space after them
1001 // in formatted mode, otherwise 0.
1002 // This is hard. Sometimes 0 is a better number, but
1003 // how to know?
1004 if (mSettings.HasFlag(nsIDocumentEncoder::OutputFormatted)) {
1005 EnsureVerticalSpace(1);
1006 } else {
1007 if (mFloatingLines < 0) mFloatingLines = 0;
1008 mLineBreakDue = true;
// Headings, links and inline structure markers are only closed for
// formatted output.
1012 if (mSettings.HasFlag(nsIDocumentEncoder::OutputFormatted)) {
1013 CloseContainerForOutputFormatted(aTag);
1016 return NS_OK;
1019 void nsPlainTextSerializer::CloseContainerForOutputFormatted(
1020 const nsAtom* aTag) {
1021 const bool currentNodeIsConverted = IsCurrentNodeConverted();
1023 if (aTag == nsGkAtoms::h1 || aTag == nsGkAtoms::h2 || aTag == nsGkAtoms::h3 ||
1024 aTag == nsGkAtoms::h4 || aTag == nsGkAtoms::h5 || aTag == nsGkAtoms::h6) {
1025 using HeaderStrategy = Settings::HeaderStrategy;
1026 if ((mSettings.GetHeaderStrategy() ==
1027 HeaderStrategy::kIndentIncreasedWithHeaderLevel) ||
1028 (mSettings.GetHeaderStrategy() ==
1029 HeaderStrategy::kNumberHeadingsAndIndentSlightly)) {
1030 mCurrentLine.mIndentation.mLength -= kIndentSizeHeaders;
1032 if (mSettings.GetHeaderStrategy() ==
1033 HeaderStrategy::kIndentIncreasedWithHeaderLevel) {
1034 for (int32_t i = HeaderLevel(aTag); i > 1; i--) {
1035 // for h(x), run x-1 times
1036 mCurrentLine.mIndentation.mLength -= kIndentIncrementHeaders;
1039 EnsureVerticalSpace(1);
1040 } else if (aTag == nsGkAtoms::a && !currentNodeIsConverted) {
1041 nsAutoString url;
1042 if (NS_SUCCEEDED(GetAttributeValue(nsGkAtoms::href, url)) &&
1043 !url.IsEmpty()) {
1044 nsAutoString temp;
1045 temp.AssignLiteral(" <");
1046 temp += url;
1047 temp.Append(char16_t('>'));
1048 Write(temp);
1050 } else if ((aTag == nsGkAtoms::sup || aTag == nsGkAtoms::sub) &&
1051 mSettings.GetStructs() && !currentNodeIsConverted) {
1052 Write(kSpace);
1053 } else if (aTag == nsGkAtoms::code && mSettings.GetStructs() &&
1054 !currentNodeIsConverted) {
1055 Write(u"|"_ns);
1056 } else if ((aTag == nsGkAtoms::strong || aTag == nsGkAtoms::b) &&
1057 mSettings.GetStructs() && !currentNodeIsConverted) {
1058 Write(u"*"_ns);
1059 } else if ((aTag == nsGkAtoms::em || aTag == nsGkAtoms::i) &&
1060 mSettings.GetStructs() && !currentNodeIsConverted) {
1061 Write(u"/"_ns);
1062 } else if (aTag == nsGkAtoms::u && mSettings.GetStructs() &&
1063 !currentNodeIsConverted) {
1064 Write(u"_"_ns);
1068 bool nsPlainTextSerializer::MustSuppressLeaf() const {
1069 if (mIgnoredChildNodeLevel > 0) {
1070 return true;
1073 if ((mTagStackIndex > 1 &&
1074 mTagStack[mTagStackIndex - 2] == nsGkAtoms::select) ||
1075 (mTagStackIndex > 0 &&
1076 mTagStack[mTagStackIndex - 1] == nsGkAtoms::select)) {
1077 // Don't output the contents of SELECT elements;
1078 // Might be nice, eventually, to output just the selected element.
1079 // Read more in bug 31994.
1080 return true;
1083 return false;
1086 void nsPlainTextSerializer::DoAddText() { DoAddText(true, u""_ns); }
1088 void nsPlainTextSerializer::DoAddText(bool aIsLineBreak,
1089 const nsAString& aText) {
1090 // If we don't want any output, just return
1091 if (!DoOutput()) {
1092 return;
1095 if (!aIsLineBreak) {
1096 // Make sure to reset this, since it's no longer true.
1097 mHasWrittenCiteBlockquote = false;
1100 if (mLineBreakDue) EnsureVerticalSpace(mFloatingLines);
1102 if (MustSuppressLeaf()) {
1103 return;
1106 if (aIsLineBreak) {
1107 // The only times we want to pass along whitespace from the original
1108 // html source are if we're forced into preformatted mode via flags,
1109 // or if we're prettyprinting and we're inside a <pre>.
1110 // Otherwise, either we're collapsing to minimal text, or we're
1111 // prettyprinting to mimic the html format, and in neither case
1112 // does the formatting of the html source help us.
1113 if (mSettings.HasFlag(nsIDocumentEncoder::OutputPreformatted) ||
1114 (mPreFormattedMail && !mSettings.GetWrapColumn()) ||
1115 IsElementPreformatted()) {
1116 EnsureVerticalSpace(mEmptyLines + 1);
1117 } else if (!mInWhitespace) {
1118 Write(kSpace);
1119 mInWhitespace = true;
1121 return;
1124 Write(aText);
1127 void CreateLineOfDashes(nsAString& aResult, const uint32_t aWrapColumn) {
1128 MOZ_ASSERT(aResult.IsEmpty());
1130 const uint32_t width = (aWrapColumn > 0 ? aWrapColumn : 25);
1131 while (aResult.Length() < width) {
1132 aResult.Append(char16_t('-'));
1136 nsresult nsPlainTextSerializer::DoAddLeaf(const nsAtom* aTag) {
1137 mPreformattedBlockBoundary = false;
1139 if (!DoOutput()) {
1140 return NS_OK;
1143 if (mLineBreakDue) EnsureVerticalSpace(mFloatingLines);
1145 if (MustSuppressLeaf()) {
1146 return NS_OK;
1149 if (aTag == nsGkAtoms::br) {
1150 // Another egregious editor workaround, see bug 38194:
1151 // ignore the bogus br tags that the editor sticks here and there.
1152 // FYI: `brElement` may be `nullptr` if the element is <br> element
1153 // of non-HTML element.
1154 // XXX Do we need to call `EnsureVerticalSpace()` when the <br> element
1155 // is not an HTML element?
1156 HTMLBRElement* brElement = HTMLBRElement::FromNodeOrNull(mElement);
1157 if (!brElement || !brElement->IsPaddingForEmptyLastLine()) {
1158 EnsureVerticalSpace(mEmptyLines + 1);
1160 } else if (aTag == nsGkAtoms::hr &&
1161 mSettings.HasFlag(nsIDocumentEncoder::OutputFormatted)) {
1162 EnsureVerticalSpace(0);
1164 // Make a line of dashes as wide as the wrap width
1165 // XXX honoring percentage would be nice
1166 nsAutoString line;
1167 CreateLineOfDashes(line, mSettings.GetWrapColumn());
1168 Write(line);
1170 EnsureVerticalSpace(0);
1171 } else if (aTag == nsGkAtoms::img) {
1172 /* Output (in decreasing order of preference)
1173 alt, title or nothing */
1174 // See <http://www.w3.org/TR/REC-html40/struct/objects.html#edef-IMG>
1175 nsAutoString imageDescription;
1176 if (NS_SUCCEEDED(GetAttributeValue(nsGkAtoms::alt, imageDescription))) {
1177 // If the alt attribute has an empty value (|alt=""|), output nothing
1178 } else if (NS_SUCCEEDED(
1179 GetAttributeValue(nsGkAtoms::title, imageDescription)) &&
1180 !imageDescription.IsEmpty()) {
1181 imageDescription = u" ["_ns + imageDescription + u"] "_ns;
1184 Write(imageDescription);
1187 return NS_OK;
1191 * Adds as many newline as necessary to get |aNumberOfRows| empty lines
1193 * aNumberOfRows = -1 : Being in the middle of some line of text
1194 * aNumberOfRows = 0 : Being at the start of a line
1195 * aNumberOfRows = n>0 : Having n empty lines before the current line.
1197 void nsPlainTextSerializer::EnsureVerticalSpace(const int32_t aNumberOfRows) {
1198 // If we have something in the indent we probably want to output
1199 // it and it's not included in the count for empty lines so we don't
1200 // realize that we should start a new line.
1201 if (aNumberOfRows >= 0 && !mCurrentLine.mIndentation.mHeader.IsEmpty()) {
1202 EndLine(false);
1203 mInWhitespace = true;
1206 while (mEmptyLines < aNumberOfRows) {
1207 EndLine(false);
1208 mInWhitespace = true;
1210 mLineBreakDue = false;
1211 mFloatingLines = -1;
1214 void nsPlainTextSerializer::OutputManager::Flush(CurrentLine& aCurrentLine) {
1215 if (!aCurrentLine.mContent.IsEmpty()) {
1216 aCurrentLine.MaybeReplaceNbspsInContent(mFlags);
1218 Append(aCurrentLine, StripTrailingWhitespaces::kNo);
1220 aCurrentLine.ResetContentAndIndentationHeader();
1224 static bool IsSpaceStuffable(const char16_t* s) {
1225 return (s[0] == '>' || s[0] == ' ' || s[0] == kNBSP ||
1226 NS_strncmp(s, u"From ", 5) == 0);
1229 void nsPlainTextSerializer::MaybeWrapAndOutputCompleteLines() {
1230 if (!mSettings.MayWrap()) {
1231 return;
1234 // Yes, wrap!
1235 // The "+4" is to avoid wrap lines that only would be a couple
1236 // of letters too long. We give this bonus only if the
1237 // wrapcolumn is more than 20.
1238 const uint32_t wrapColumn = mSettings.GetWrapColumn();
1239 uint32_t bonuswidth = (wrapColumn > 20) ? 4 : 0;
1240 while (!mCurrentLine.mContent.IsEmpty()) {
1241 const uint32_t prefixwidth = mCurrentLine.DeterminePrefixWidth();
1242 // The width of the line as it will appear on the screen (approx.).
1243 const uint32_t currentLineContentWidth =
1244 GetUnicharStringWidth(mCurrentLine.mContent);
1245 if (currentLineContentWidth + prefixwidth <= wrapColumn + bonuswidth) {
1246 break;
1249 const int32_t goodSpace =
1250 mCurrentLine.FindWrapIndexForContent(wrapColumn, mUseLineBreaker);
1252 const int32_t contentLength = mCurrentLine.mContent.Length();
1253 if (goodSpace <= 0 || goodSpace >= contentLength) {
1254 // Nothing to do. Hopefully we get more data later to use for a place to
1255 // break line.
1256 break;
1258 // Found a place to break
1259 // -1 (trim a char at the break position) only if the line break was a
1260 // space.
1261 nsAutoString restOfContent;
1262 if (nsCRT::IsAsciiSpace(mCurrentLine.mContent.CharAt(goodSpace))) {
1263 mCurrentLine.mContent.Right(restOfContent, contentLength - goodSpace - 1);
1264 } else {
1265 mCurrentLine.mContent.Right(restOfContent, contentLength - goodSpace);
1267 // if breaker was U+0020, it has to consider for delsp=yes support
1268 const bool breakBySpace = mCurrentLine.mContent.CharAt(goodSpace) == ' ';
1269 mCurrentLine.mContent.Truncate(goodSpace);
1270 EndLine(true, breakBySpace);
1271 mCurrentLine.mContent.Truncate();
1272 // Space stuffing a la RFC 2646 (format=flowed)
1273 if (mSettings.HasFlag(nsIDocumentEncoder::OutputFormatFlowed)) {
1274 mCurrentLine.mSpaceStuffed = !restOfContent.IsEmpty() &&
1275 IsSpaceStuffable(restOfContent.get()) &&
1276 // We space-stuff quoted lines anyway
1277 mCurrentLine.mCiteQuoteLevel == 0;
1279 mCurrentLine.mContent.Append(restOfContent);
1280 mEmptyLines = -1;
1285 * This function adds a piece of text to the current stored line. If we are
1286 * wrapping text and the stored line will become too long, a suitable
1287 * location to wrap will be found and the line that's complete will be
1288 * output.
1290 void nsPlainTextSerializer::AddToLine(const char16_t* aLineFragment,
1291 int32_t aLineFragmentLength) {
1292 if (mLineBreakDue) EnsureVerticalSpace(mFloatingLines);
1294 if (mCurrentLine.mContent.IsEmpty()) {
1295 if (0 == aLineFragmentLength) {
1296 return;
1299 if (mSettings.HasFlag(nsIDocumentEncoder::OutputFormatFlowed)) {
1300 // Space stuffing a la RFC 2646 (format=flowed).
1301 // We space-stuff quoted lines anyway
1302 mCurrentLine.mSpaceStuffed =
1303 IsSpaceStuffable(aLineFragment) && mCurrentLine.mCiteQuoteLevel == 0;
1305 mEmptyLines = -1;
1308 mCurrentLine.mContent.Append(aLineFragment, aLineFragmentLength);
1310 MaybeWrapAndOutputCompleteLines();
1313 // The signature separator (RFC 2646).
1314 const char kSignatureSeparator[] = "-- ";
1316 // The OpenPGP dash-escaped signature separator in inline
1317 // signed messages according to the OpenPGP standard (RFC 2440).
1318 const char kDashEscapedSignatureSeparator[] = "- -- ";
1320 static bool IsSignatureSeparator(const nsAString& aString) {
1321 return aString.EqualsLiteral(kSignatureSeparator) ||
1322 aString.EqualsLiteral(kDashEscapedSignatureSeparator);
1326 * Outputs the contents of mCurrentLine.mContent, and resets line
1327 * specific variables. Also adds an indentation and prefix if there is one
1328 * specified. Strips ending spaces from the line if it isn't preformatted.
1330 void nsPlainTextSerializer::EndLine(bool aSoftLineBreak, bool aBreakBySpace) {
1331 if (aSoftLineBreak && mCurrentLine.mContent.IsEmpty()) {
1332 // No meaning
1333 return;
1336 /* In non-preformatted mode, remove spaces from the end of the line for
1337 * format=flowed compatibility. Don't do this for these special cases:
1338 * "-- ", the signature separator (RFC 2646) shouldn't be touched and
1339 * "- -- ", the OpenPGP dash-escaped signature separator in inline
1340 * signed messages according to the OpenPGP standard (RFC 2440).
1342 if (!mSettings.HasFlag(nsIDocumentEncoder::OutputPreformatted) &&
1343 (aSoftLineBreak || !IsSignatureSeparator(mCurrentLine.mContent))) {
1344 mCurrentLine.mContent.Trim(" ", false, true, false);
1347 if (aSoftLineBreak &&
1348 mSettings.HasFlag(nsIDocumentEncoder::OutputFormatFlowed) &&
1349 !mCurrentLine.mIndentation.mLength) {
1350 // Add the soft part of the soft linebreak (RFC 2646 4.1)
1351 // We only do this when there is no indentation since format=flowed
1352 // lines and indentation doesn't work well together.
1354 // If breaker character is ASCII space with RFC 3676 support (delsp=yes),
1355 // add twice space.
1356 if (mSettings.HasFlag(nsIDocumentEncoder::OutputFormatDelSp) &&
1357 aBreakBySpace) {
1358 mCurrentLine.mContent.AppendLiteral(" ");
1359 } else {
1360 mCurrentLine.mContent.Append(char16_t(' '));
1364 if (aSoftLineBreak) {
1365 mEmptyLines = 0;
1366 } else {
1367 // Hard break
1368 if (mCurrentLine.HasContentOrIndentationHeader()) {
1369 mEmptyLines = 0;
1370 } else {
1371 mEmptyLines++;
1375 MOZ_ASSERT(mOutputManager);
1377 mCurrentLine.MaybeReplaceNbspsInContent(mSettings.GetFlags());
1379 // If we don't have anything "real" to output we have to
1380 // make sure the indent doesn't end in a space since that
1381 // would trick a format=flowed-aware receiver.
1382 mOutputManager->Append(mCurrentLine,
1383 OutputManager::StripTrailingWhitespaces::kMaybe);
1384 mOutputManager->AppendLineBreak();
1385 mCurrentLine.ResetContentAndIndentationHeader();
1386 mInWhitespace = true;
1387 mLineBreakDue = false;
1388 mFloatingLines = -1;
1392 * Creates the calculated and stored indent and text in the indentation. That is
1393 * quote chars and numbers for numbered lists and such.
1395 void nsPlainTextSerializer::CurrentLine::CreateQuotesAndIndent(
1396 nsAString& aResult) const {
1397 // Put the mail quote "> " chars in, if appropriate:
1398 if (mCiteQuoteLevel > 0) {
1399 nsAutoString quotes;
1400 for (int i = 0; i < mCiteQuoteLevel; i++) {
1401 quotes.Append(char16_t('>'));
1403 if (!mContent.IsEmpty()) {
1404 /* Better don't output a space here, if the line is empty,
1405 in case a receiving format=flowed-aware UA thinks, this were a flowed
1406 line, which it isn't - it's just empty. (Flowed lines may be joined
1407 with the following one, so the empty line may be lost completely.) */
1408 quotes.Append(char16_t(' '));
1410 aResult = quotes;
1413 // Indent if necessary
1414 int32_t indentwidth = mIndentation.mLength - mIndentation.mHeader.Length();
1415 if (mSpaceStuffed) {
1416 indentwidth += 1;
1419 // Don't make empty lines look flowed
1420 if (indentwidth > 0 && HasContentOrIndentationHeader()) {
1421 nsAutoString spaces;
1422 for (int i = 0; i < indentwidth; ++i) {
1423 spaces.Append(char16_t(' '));
1425 aResult += spaces;
1428 if (!mIndentation.mHeader.IsEmpty()) {
1429 aResult += mIndentation.mHeader;
/** Returns true if |c| is a line feed, carriage return, blank or tab. */
static bool IsLineFeedCarriageReturnBlankOrTab(char16_t c) {
  switch (c) {
    case '\n':
    case '\r':
    case ' ':
    case '\t':
      return true;
    default:
      return false;
  }
}
1437 static void ReplaceVisiblyTrailingNbsps(nsAString& aString) {
1438 const int32_t totLen = aString.Length();
1439 for (int32_t i = totLen - 1; i >= 0; i--) {
1440 char16_t c = aString[i];
1441 if (IsLineFeedCarriageReturnBlankOrTab(c)) {
1442 continue;
1444 if (kNBSP == c) {
1445 aString.Replace(i, 1, ' ');
1446 } else {
1447 break;
1452 void nsPlainTextSerializer::ConvertToLinesAndOutput(const nsAString& aString) {
1453 const int32_t totLen = aString.Length();
1454 int32_t newline{0};
1456 // Put the mail quote "> " chars in, if appropriate.
1457 // Have to put it in before every line.
1458 int32_t bol = 0;
1459 while (bol < totLen) {
1460 bool outputLineBreak = false;
1461 bool spacesOnly = true;
1463 // Find one of '\n' or '\r' using iterators since nsAString
1464 // doesn't have the old FindCharInSet function.
1465 nsAString::const_iterator iter;
1466 aString.BeginReading(iter);
1467 nsAString::const_iterator done_searching;
1468 aString.EndReading(done_searching);
1469 iter.advance(bol);
1470 int32_t new_newline = bol;
1471 newline = kNotFound;
1472 while (iter != done_searching) {
1473 if ('\n' == *iter || '\r' == *iter) {
1474 newline = new_newline;
1475 break;
1477 if (' ' != *iter) {
1478 spacesOnly = false;
1480 ++new_newline;
1481 ++iter;
1484 // Done searching
1485 nsAutoString stringpart;
1486 if (newline == kNotFound) {
1487 // No new lines.
1488 stringpart.Assign(Substring(aString, bol, totLen - bol));
1489 if (!stringpart.IsEmpty()) {
1490 char16_t lastchar = stringpart.Last();
1491 mInWhitespace = IsLineFeedCarriageReturnBlankOrTab(lastchar);
1493 mEmptyLines = -1;
1494 bol = totLen;
1495 } else {
1496 // There is a newline
1497 stringpart.Assign(Substring(aString, bol, newline - bol));
1498 mInWhitespace = true;
1499 outputLineBreak = true;
1500 mEmptyLines = 0;
1501 bol = newline + 1;
1502 if ('\r' == *iter && bol < totLen && '\n' == *++iter) {
1503 // There was a CRLF in the input. This used to be illegal and
1504 // stripped by the parser. Apparently not anymore. Let's skip
1505 // over the LF.
1506 bol++;
1510 if (mSettings.HasFlag(nsIDocumentEncoder::OutputFormatFlowed)) {
1511 if ((outputLineBreak || !spacesOnly) && // bugs 261467,125928
1512 !IsQuotedLine(stringpart) && !IsSignatureSeparator(stringpart)) {
1513 stringpart.Trim(" ", false, true, true);
1515 mCurrentLine.mSpaceStuffed =
1516 IsSpaceStuffable(stringpart.get()) && !IsQuotedLine(stringpart);
1518 mCurrentLine.mContent.Append(stringpart);
1520 mCurrentLine.MaybeReplaceNbspsInContent(mSettings.GetFlags());
1522 mOutputManager->Append(mCurrentLine,
1523 OutputManager::StripTrailingWhitespaces::kNo);
1524 if (outputLineBreak) {
1525 mOutputManager->AppendLineBreak();
1528 mCurrentLine.ResetContentAndIndentationHeader();
1531 #ifdef DEBUG_wrapping
1532 printf("No wrapping: newline is %d, totLen is %d\n", newline, totLen);
1533 #endif
1537 * Write a string. This is the highlevel function to use to get text output.
1538 * By using AddToLine, Output, EndLine and other functions it handles quotation,
1539 * line wrapping, indentation, whitespace compression and other things.
1541 void nsPlainTextSerializer::Write(const nsAString& aStr) {
1542 // XXX Copy necessary to use nsString methods and gain
1543 // access to underlying buffer
1544 nsAutoString str(aStr);
1546 #ifdef DEBUG_wrapping
1547 printf("Write(%s): wrap col = %d\n", NS_ConvertUTF16toUTF8(str).get(),
1548 mSettings.GetWrapColumn());
1549 #endif
1551 const int32_t totLen = str.Length();
1553 // If the string is empty, do nothing:
1554 if (totLen <= 0) return;
1556 // For Flowed text change nbsp-ses to spaces at end of lines to allow them
1557 // to be cut off along with usual spaces if required. (bug #125928)
1558 if (mSettings.HasFlag(nsIDocumentEncoder::OutputFormatFlowed)) {
1559 ReplaceVisiblyTrailingNbsps(str);
1562 // We have two major codepaths here. One that does preformatted text and one
1563 // that does normal formatted text. The one for preformatted text calls
1564 // Output directly while the other code path goes through AddToLine.
1565 if ((mPreFormattedMail && !mSettings.GetWrapColumn()) ||
1566 (IsElementPreformatted() && !mPreFormattedMail) ||
1567 (mSpanLevel > 0 && mEmptyLines >= 0 && IsQuotedLine(str))) {
1568 // No intelligent wrapping.
1570 // This mustn't be mixed with intelligent wrapping without clearing
1571 // the mCurrentLine.mContent buffer before!!!
1572 NS_ASSERTION(mCurrentLine.mContent.IsEmpty() ||
1573 (IsElementPreformatted() && !mPreFormattedMail),
1574 "Mixed wrapping data and nonwrapping data on the same line");
1575 MOZ_ASSERT(mOutputManager);
1577 if (!mCurrentLine.mContent.IsEmpty()) {
1578 mOutputManager->Flush(mCurrentLine);
1581 ConvertToLinesAndOutput(str);
1582 return;
1585 // Intelligent handling of text
1586 // If needed, strip out all "end of lines"
1587 // and multiple whitespace between words
1588 int32_t nextpos;
1589 const char16_t* offsetIntoBuffer = nullptr;
1591 int32_t bol = 0;
1592 while (bol < totLen) { // Loop over lines
1593 // Find a place where we may have to do whitespace compression
1594 nextpos = str.FindCharInSet(u" \t\n\r", bol);
1595 #ifdef DEBUG_wrapping
1596 nsAutoString remaining;
1597 str.Right(remaining, totLen - bol);
1598 foo = ToNewCString(remaining);
1599 // printf("Next line: bol = %d, newlinepos = %d, totLen = %d, "
1600 // "string = '%s'\n", bol, nextpos, totLen, foo);
1601 free(foo);
1602 #endif
1604 if (nextpos == kNotFound) {
1605 // The rest of the string
1606 offsetIntoBuffer = str.get() + bol;
1607 AddToLine(offsetIntoBuffer, totLen - bol);
1608 bol = totLen;
1609 mInWhitespace = false;
1610 } else {
1611 // There's still whitespace left in the string
1612 if (nextpos != 0 && (nextpos + 1) < totLen) {
1613 offsetIntoBuffer = str.get() + nextpos;
1614 // skip '\n' if it is between CJ chars
1615 if (offsetIntoBuffer[0] == '\n' && IS_CJ_CHAR(offsetIntoBuffer[-1]) &&
1616 IS_CJ_CHAR(offsetIntoBuffer[1])) {
1617 offsetIntoBuffer = str.get() + bol;
1618 AddToLine(offsetIntoBuffer, nextpos - bol);
1619 bol = nextpos + 1;
1620 continue;
1623 // If we're already in whitespace and not preformatted, just skip it:
1624 if (mInWhitespace && (nextpos == bol) && !mPreFormattedMail &&
1625 !mSettings.HasFlag(nsIDocumentEncoder::OutputPreformatted)) {
1626 // Skip whitespace
1627 bol++;
1628 continue;
1631 if (nextpos == bol) {
1632 // Note that we are in whitespace.
1633 mInWhitespace = true;
1634 offsetIntoBuffer = str.get() + nextpos;
1635 AddToLine(offsetIntoBuffer, 1);
1636 bol++;
1637 continue;
1640 mInWhitespace = true;
1642 offsetIntoBuffer = str.get() + bol;
1643 if (mPreFormattedMail ||
1644 mSettings.HasFlag(nsIDocumentEncoder::OutputPreformatted)) {
1645 // Preserve the real whitespace character
1646 nextpos++;
1647 AddToLine(offsetIntoBuffer, nextpos - bol);
1648 bol = nextpos;
1649 } else {
1650 // Replace the whitespace with a space
1651 AddToLine(offsetIntoBuffer, nextpos - bol);
1652 AddToLine(kSpace.get(), 1);
1653 bol = nextpos + 1; // Let's eat the whitespace
1656 } // Continue looping over the string
1660 * Gets the value of an attribute in a string. If the function returns
1661 * NS_ERROR_NOT_AVAILABLE, there was none such attribute specified.
1663 nsresult nsPlainTextSerializer::GetAttributeValue(const nsAtom* aName,
1664 nsString& aValueRet) const {
1665 if (mElement) {
1666 if (mElement->GetAttr(aName, aValueRet)) {
1667 return NS_OK;
1671 return NS_ERROR_NOT_AVAILABLE;
1675 * Returns true, if the element was inserted by Moz' TXT->HTML converter.
1676 * In this case, we should ignore it.
1678 bool nsPlainTextSerializer::IsCurrentNodeConverted() const {
1679 nsAutoString value;
1680 nsresult rv = GetAttributeValue(nsGkAtoms::_class, value);
1681 return (NS_SUCCEEDED(rv) &&
1682 (StringBeginsWith(value, u"moz-txt"_ns,
1683 nsASCIICaseInsensitiveStringComparator) ||
1684 StringBeginsWith(value, u"\"moz-txt"_ns,
1685 nsASCIICaseInsensitiveStringComparator)));
1688 // static
1689 nsAtom* nsPlainTextSerializer::GetIdForContent(nsIContent* aContent) {
1690 if (!aContent->IsHTMLElement()) {
1691 return nullptr;
1694 nsAtom* localName = aContent->NodeInfo()->NameAtom();
1695 return localName->IsStatic() ? localName : nullptr;
1698 bool nsPlainTextSerializer::IsElementPreformatted() const {
1699 return !mPreformatStack.empty() && mPreformatStack.top();
1702 bool nsPlainTextSerializer::IsElementPreformatted(Element* aElement) {
1703 RefPtr<const ComputedStyle> computedStyle =
1704 nsComputedDOMStyle::GetComputedStyleNoFlush(aElement);
1705 if (computedStyle) {
1706 const nsStyleText* textStyle = computedStyle->StyleText();
1707 return textStyle->WhiteSpaceOrNewlineIsSignificant();
1709 // Fall back to looking at the tag, in case there is no style information.
1710 return GetIdForContent(aElement) == nsGkAtoms::pre;
1713 bool nsPlainTextSerializer::IsCssBlockLevelElement(Element* aElement) {
1714 RefPtr<const ComputedStyle> computedStyle =
1715 nsComputedDOMStyle::GetComputedStyleNoFlush(aElement);
1716 if (computedStyle) {
1717 const nsStyleDisplay* displayStyle = computedStyle->StyleDisplay();
1718 return displayStyle->IsBlockOutsideStyle();
1720 // Fall back to looking at the tag, in case there is no style information.
1721 return nsContentUtils::IsHTMLBlockLevelElement(aElement);
1725 * This method is required only to identify LI's inside OL.
1726 * Returns TRUE if we are inside an OL tag and FALSE otherwise.
1728 bool nsPlainTextSerializer::IsInOL() const {
1729 int32_t i = mTagStackIndex;
1730 while (--i >= 0) {
1731 if (mTagStack[i] == nsGkAtoms::ol) return true;
1732 if (mTagStack[i] == nsGkAtoms::ul) {
1733 // If a UL is reached first, LI belongs the UL nested in OL.
1734 return false;
1737 // We may reach here for orphan LI's.
1738 return false;
1741 bool nsPlainTextSerializer::IsInOlOrUl() const {
1742 return (mULCount > 0) || !mOLStack.IsEmpty();
1746 @return 0 = no header, 1 = h1, ..., 6 = h6
1748 int32_t HeaderLevel(const nsAtom* aTag) {
1749 if (aTag == nsGkAtoms::h1) {
1750 return 1;
1752 if (aTag == nsGkAtoms::h2) {
1753 return 2;
1755 if (aTag == nsGkAtoms::h3) {
1756 return 3;
1758 if (aTag == nsGkAtoms::h4) {
1759 return 4;
1761 if (aTag == nsGkAtoms::h5) {
1762 return 5;
1764 if (aTag == nsGkAtoms::h6) {
1765 return 6;
1767 return 0;
1770 /* These functions define the column width of an ISO 10646 character
1771 * as follows:
1773 * - The null character (U+0000) has a column width of 0.
1775 * - Other C0/C1 control characters and DEL will lead to a return
1776 * value of -1.
1778 * - Non-spacing and enclosing combining characters (general
1779 * category code Mn or Me in the Unicode database) have a
1780 * column width of 0.
1782 * - Spacing characters in the East Asian Wide (W) or East Asian
1783 * FullWidth (F) category as defined in Unicode Technical
1784 * Report #11 have a column width of 2.
1786 * - All remaining characters (including all printable
1787 * ISO 8859-1 and WGL4 characters, Unicode control characters,
1788 * etc.) have a column width of 1.
1791 int32_t GetUnicharWidth(char32_t aCh) {
1792 /* test for 8-bit control characters */
1793 if (aCh == 0) {
1794 return 0;
1796 if (aCh < 32 || (aCh >= 0x7f && aCh < 0xa0)) {
1797 return -1;
1800 /* The first combining char in Unicode is U+0300 */
1801 if (aCh < 0x0300) {
1802 return 1;
1805 auto gc = unicode::GetGeneralCategory(aCh);
1806 if (gc == HB_UNICODE_GENERAL_CATEGORY_NON_SPACING_MARK ||
1807 gc == HB_UNICODE_GENERAL_CATEGORY_ENCLOSING_MARK) {
1808 return 0;
1811 /* if we arrive here, ucs is not a combining or C0/C1 control character */
1813 /* fast test for majority of non-wide scripts */
1814 if (aCh < 0x1100) {
1815 return 1;
1818 return intl::UnicodeProperties::IsEastAsianWidthFW(aCh) ? 2 : 1;
1821 int32_t GetUnicharStringWidth(Span<const char16_t> aString) {
1822 int32_t width = 0;
1823 for (auto iter = aString.begin(); iter != aString.end(); ++iter) {
1824 char32_t c = *iter;
1825 if (NS_IS_HIGH_SURROGATE(c) && (iter + 1) != aString.end() &&
1826 NS_IS_LOW_SURROGATE(*(iter + 1))) {
1827 c = SURROGATE_TO_UCS4(c, *++iter);
1829 const int32_t w = GetUnicharWidth(c);
1830 // Taking 1 as the width of non-printable character, for bug 94475.
1831 width += (w < 0 ? 1 : w);
1833 return width;