tdf#163956 calculate line height differently in FORMTEXT
[LibreOffice.git] / svtools / source / svhtml / parhtml.cxx
blob a5bffbd9dcbdd5e6259f5eb347e201dee2f4d8fd
1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 * This file incorporates work covered by the following license notice:
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
20 #include <comphelper/string.hxx>
21 #include <o3tl/safeint.hxx>
22 #include <o3tl/string_view.hxx>
23 #include <tools/stream.hxx>
24 #include <tools/debug.hxx>
25 #include <tools/color.hxx>
26 #include <rtl/ustrbuf.hxx>
27 #include <rtl/character.hxx>
28 #include <rtl/tencinfo.h>
29 #include <sal/log.hxx>
30 #include <tools/tenccvt.hxx>
31 #include <tools/datetime.hxx>
32 #include <unotools/datetime.hxx>
33 #include <svl/inettype.hxx>
34 #include <svl/lngmisc.hxx>
35 #include <com/sun/star/beans/PropertyAttribute.hpp>
36 #include <com/sun/star/document/XDocumentProperties.hpp>
38 #include <svtools/parhtml.hxx>
39 #include <svtools/htmltokn.h>
40 #include <svtools/htmlkywd.hxx>
42 #include <utility>
44 using namespace ::com::sun::star;
47 const sal_Int32 MAX_LEN( 1024 );
49 const sal_Int32 MAX_ENTITY_LEN( 8 );
// Tables to convert option values into strings

// <INPUT TYPE=xxx>
// Pairs each TYPE keyword with its HTMLInputType value; consulted by
// HTMLOption::GetInputType() through GetEnum(). The trailing { nullptr, ... }
// entry is presumably the end-of-table marker GetEnum() stops at.
HTMLOptionEnum<HTMLInputType> const aInputTypeOptEnums[] =
{
    { OOO_STRING_SVTOOLS_HTML_IT_text,      HTMLInputType::Text     },
    { OOO_STRING_SVTOOLS_HTML_IT_password,  HTMLInputType::Password },
    { OOO_STRING_SVTOOLS_HTML_IT_checkbox,  HTMLInputType::Checkbox },
    { OOO_STRING_SVTOOLS_HTML_IT_radio,     HTMLInputType::Radio    },
    { OOO_STRING_SVTOOLS_HTML_IT_range,     HTMLInputType::Range    },
    { OOO_STRING_SVTOOLS_HTML_IT_scribble,  HTMLInputType::Scribble },
    { OOO_STRING_SVTOOLS_HTML_IT_file,      HTMLInputType::File     },
    { OOO_STRING_SVTOOLS_HTML_IT_hidden,    HTMLInputType::Hidden   },
    { OOO_STRING_SVTOOLS_HTML_IT_submit,    HTMLInputType::Submit   },
    { OOO_STRING_SVTOOLS_HTML_IT_image,     HTMLInputType::Image    },
    { OOO_STRING_SVTOOLS_HTML_IT_reset,     HTMLInputType::Reset    },
    { OOO_STRING_SVTOOLS_HTML_IT_button,    HTMLInputType::Button   },
    { nullptr,                              HTMLInputType(0)        }
};
// <TABLE FRAME=xxx>
// Pairs each FRAME keyword with its HTMLTableFrame value; consulted by
// HTMLOption::GetTableFrame(). Note that both "box" and "border" map to
// HTMLTableFrame::Box. The { nullptr, ... } entry is presumably the
// end-of-table marker.
HTMLOptionEnum<HTMLTableFrame> const aTableFrameOptEnums[] =
{
    { OOO_STRING_SVTOOLS_HTML_TF_void,      HTMLTableFrame::Void    },
    { OOO_STRING_SVTOOLS_HTML_TF_above,     HTMLTableFrame::Above   },
    { OOO_STRING_SVTOOLS_HTML_TF_below,     HTMLTableFrame::Below   },
    { OOO_STRING_SVTOOLS_HTML_TF_hsides,    HTMLTableFrame::HSides  },
    { OOO_STRING_SVTOOLS_HTML_TF_lhs,       HTMLTableFrame::LHS     },
    { OOO_STRING_SVTOOLS_HTML_TF_rhs,       HTMLTableFrame::RHS     },
    { OOO_STRING_SVTOOLS_HTML_TF_vsides,    HTMLTableFrame::VSides  },
    { OOO_STRING_SVTOOLS_HTML_TF_box,       HTMLTableFrame::Box     },
    { OOO_STRING_SVTOOLS_HTML_TF_border,    HTMLTableFrame::Box     },
    { nullptr,                              HTMLTableFrame(0)       }
};
// <TABLE RULES=xxx>
// Pairs each RULES keyword with its HTMLTableRules value; consulted by
// HTMLOption::GetTableRules(). The { nullptr, ... } entry is presumably the
// end-of-table marker.
HTMLOptionEnum<HTMLTableRules> const aTableRulesOptEnums[] =
{
    { OOO_STRING_SVTOOLS_HTML_TR_none,      HTMLTableRules::NONE    },
    { OOO_STRING_SVTOOLS_HTML_TR_groups,    HTMLTableRules::Groups  },
    { OOO_STRING_SVTOOLS_HTML_TR_rows,      HTMLTableRules::Rows    },
    { OOO_STRING_SVTOOLS_HTML_TR_cols,      HTMLTableRules::Cols    },
    { OOO_STRING_SVTOOLS_HTML_TR_all,       HTMLTableRules::All     },
    { nullptr,                              HTMLTableRules(0)       }
};
// Constructs an option from its token id, its token string and its value
// string. Both strings are taken by value and moved into the members.
HTMLOption::HTMLOption( HtmlOptionId nTok, OUString _aToken,
                        OUString _aValue )
    : aValue(std::move(_aValue))
    , aToken(std::move(_aToken))
    , nToken( nTok )
{
    // Sanity check: the id has to lie in [BOOL_START, END).
    DBG_ASSERT( nToken>=HtmlOptionId::BOOL_START && nToken<HtmlOptionId::END,
        "HTMLOption: unknown token" );
}
109 sal_uInt32 HTMLOption::GetNumber() const
111 DBG_ASSERT( (nToken>=HtmlOptionId::NUMBER_START &&
112 nToken<HtmlOptionId::NUMBER_END) ||
113 (nToken>=HtmlOptionId::CONTEXT_START &&
114 nToken<HtmlOptionId::CONTEXT_END) ||
115 nToken==HtmlOptionId::VALUE,
116 "GetNumber: Option not numerical" );
117 OUString aTmp(comphelper::string::stripStart(aValue, ' '));
118 sal_Int32 nTmp = aTmp.toInt32();
119 return nTmp >= 0 ? static_cast<sal_uInt32>(nTmp) : 0;
122 sal_Int32 HTMLOption::GetSNumber() const
124 DBG_ASSERT( (nToken>=HtmlOptionId::NUMBER_START && nToken<HtmlOptionId::NUMBER_END) ||
125 (nToken>=HtmlOptionId::CONTEXT_START && nToken<HtmlOptionId::CONTEXT_END),
126 "GetSNumber: Option not numerical" );
127 OUString aTmp(comphelper::string::stripStart(aValue, ' '));
128 return aTmp.toInt32();
131 void HTMLOption::GetNumbers( std::vector<sal_uInt32> &rNumbers ) const
133 rNumbers.clear();
135 // This is a very simplified scanner: it only searches all
136 // numerals in the string.
137 bool bInNum = false;
138 sal_uInt32 nNum = 0;
139 for( sal_Int32 i=0; i<aValue.getLength(); i++ )
141 sal_Unicode c = aValue[ i ];
142 if( c>='0' && c<='9' )
144 nNum *= 10;
145 nNum += (c - '0');
146 bInNum = true;
148 else if( bInNum )
150 rNumbers.push_back( nNum );
151 bInNum = false;
152 nNum = 0;
155 if( bInNum )
157 rNumbers.push_back( nNum );
161 void HTMLOption::GetColor( Color& rColor ) const
163 DBG_ASSERT( (nToken>=HtmlOptionId::COLOR_START && nToken<HtmlOptionId::COLOR_END) || nToken==HtmlOptionId::SIZE,
164 "GetColor: Option is not a color." );
166 OUString aTmp(aValue.toAsciiLowerCase());
167 sal_uInt32 nColor = SAL_MAX_UINT32;
168 if (!aTmp.isEmpty() && aTmp[0] != '#')
169 nColor = GetHTMLColor(aTmp);
171 if( SAL_MAX_UINT32 == nColor )
173 nColor = 0;
174 sal_Int32 nPos = 0;
175 for (sal_uInt32 i=0; i<6; ++i)
177 // Whatever Netscape does to get color values,
178 // at maximum three characters < '0' are ignored.
179 sal_Unicode c = nPos<aTmp.getLength() ? aTmp[ nPos++ ] : '0';
180 if( c < '0' )
182 c = nPos<aTmp.getLength() ? aTmp[nPos++] : '0';
183 if( c < '0' )
184 c = nPos<aTmp.getLength() ? aTmp[nPos++] : '0';
186 nColor *= 16;
187 if( c >= '0' && c <= '9' )
188 nColor += (c - '0');
189 else if( c >= 'a' && c <= 'f' )
190 nColor += (c + 0xa - 'a');
194 rColor.SetRed( static_cast<sal_uInt8>((nColor & 0x00ff0000) >> 16) );
195 rColor.SetGreen( static_cast<sal_uInt8>((nColor & 0x0000ff00) >> 8));
196 rColor.SetBlue( static_cast<sal_uInt8>(nColor & 0x000000ff) );
// Looks the option value up in aInputTypeOptEnums and returns the matching
// <INPUT> type. HTMLInputType::Text is handed to GetEnum() as second
// argument - presumably the result for unknown values; confirm in GetEnum().
HTMLInputType HTMLOption::GetInputType() const
{
    DBG_ASSERT( nToken==HtmlOptionId::TYPE, "GetInputType: Option not TYPE" );
    return GetEnum( aInputTypeOptEnums, HTMLInputType::Text );
}
// Looks the option value up in aTableFrameOptEnums and returns the matching
// table frame kind. The result for unknown values is whatever GetEnum()
// defaults to (not visible here).
HTMLTableFrame HTMLOption::GetTableFrame() const
{
    DBG_ASSERT( nToken==HtmlOptionId::FRAME, "GetTableFrame: Option not FRAME" );
    return GetEnum( aTableFrameOptEnums );
}
// Looks the option value up in aTableRulesOptEnums and returns the matching
// table rules kind. The result for unknown values is whatever GetEnum()
// defaults to (not visible here).
HTMLTableRules HTMLOption::GetTableRules() const
{
    DBG_ASSERT( nToken==HtmlOptionId::RULES, "GetTableRules: Option not RULES" );
    return GetEnum( aTableRulesOptEnums );
}
// Creates a parser reading from rIn. bReadNewDoc is stored in bNewDoc.
// The parser starts out "in the header" with every special reading mode
// (listing, XMP, PRE, textarea, script, style, comment) switched off and no
// pending off-token.
HTMLParser::HTMLParser( SvStream& rIn, bool bReadNewDoc ) :
    SvParser<HtmlTokenId>( rIn ),
    bNewDoc(bReadNewDoc),
    bIsInHeader(true),
    bReadListing(false),
    bReadXMP(false),
    bReadPRE(false),
    bReadTextArea(false),
    bReadScript(false),
    bReadStyle(false),
    bEndTokenFound(false),
    bPre_IgnoreNewPara(false),
    bReadNextChar(false),
    bReadComment(false),
    nPre_LinePos(0),
    mnPendingOffToken(HtmlTokenId::NONE)
{
    //#i76649, default to UTF-8 for HTML unless we know differently
    SetSrcEncoding(RTL_TEXTENCODING_UTF8);
}
238 HTMLParser::~HTMLParser()
// Remembers the namespace alias as a prefix ("alias:") in maNamespace.
// GetNextToken_() strips that prefix from tag names before looking them up.
void HTMLParser::SetNamespace(std::u16string_view rNamespace)
{
    // Convert namespace alias to a prefix.
    maNamespace = OUString::Concat(rNamespace) + ":";
}
248 namespace
250 class RefGuard
252 private:
253 HTMLParser& m_rParser;
254 public:
255 RefGuard(HTMLParser& rParser)
256 : m_rParser(rParser)
258 m_rParser.AddFirstRef();
261 ~RefGuard()
263 if (m_rParser.GetStatus() != SvParserState::Pending)
264 m_rParser.ReleaseRef(); // Parser not needed anymore
// Entry point of a parser run: switches to state Working, reads the first
// character, resets the <PRE> bookkeeping and runs the token loop.
// Returns the state the parser is in when the loop comes back.
SvParserState HTMLParser::CallParser()
{
    eState = SvParserState::Working;
    nNextCh = GetNextChar();
    SaveState( HtmlTokenId::NONE );

    nPre_LinePos = 0;
    bPre_IgnoreNewPara = false;

    // Holds a reference on this parser while the loop runs; on scope exit it
    // is released again unless the parser is Pending. eState is read for the
    // return value before the guard is destroyed.
    RefGuard aRefGuard(*this);

    Continue( HtmlTokenId::NONE );

    return eState;
}
285 void HTMLParser::Continue( HtmlTokenId nToken )
287 if( nToken == HtmlTokenId::NONE )
288 nToken = GetNextToken();
290 while( IsParserWorking() )
292 SaveState( nToken );
293 nToken = FilterToken( nToken );
295 if( nToken != HtmlTokenId::NONE )
296 NextToken( nToken );
298 if( IsParserWorking() )
299 SaveState( HtmlTokenId::NONE ); // continue with new token
301 nToken = GetNextToken();
305 HtmlTokenId HTMLParser::FilterToken( HtmlTokenId nToken )
307 switch( nToken )
309 case HtmlTokenId(EOF):
310 nToken = HtmlTokenId::NONE;
311 break; // don't pass
313 case HtmlTokenId::HEAD_OFF:
314 bIsInHeader = false;
315 break;
317 case HtmlTokenId::HEAD_ON:
318 bIsInHeader = true;
319 break;
321 case HtmlTokenId::BODY_ON:
322 bIsInHeader = false;
323 break;
325 case HtmlTokenId::FRAMESET_ON:
326 bIsInHeader = false;
327 break;
329 case HtmlTokenId::BODY_OFF:
330 bReadPRE = bReadListing = bReadXMP = false;
331 break;
333 case HtmlTokenId::HTML_OFF:
334 nToken = HtmlTokenId::NONE;
335 bReadPRE = bReadListing = bReadXMP = false;
336 break; // HtmlTokenId::ON hasn't been passed either !
338 case HtmlTokenId::PREFORMTXT_ON:
339 StartPRE();
340 break;
342 case HtmlTokenId::PREFORMTXT_OFF:
343 FinishPRE();
344 break;
346 case HtmlTokenId::LISTING_ON:
347 StartListing();
348 break;
350 case HtmlTokenId::LISTING_OFF:
351 FinishListing();
352 break;
354 case HtmlTokenId::XMP_ON:
355 StartXMP();
356 break;
358 case HtmlTokenId::XMP_OFF:
359 FinishXMP();
360 break;
362 default:
363 if( bReadPRE )
364 nToken = FilterPRE( nToken );
365 else if( bReadListing )
366 nToken = FilterListing( nToken );
367 else if( bReadXMP )
368 nToken = FilterXMP( nToken );
370 break;
373 return nToken;
376 namespace {
378 constexpr bool HTML_ISPRINTABLE(sal_Unicode c) { return c >= 32 && c != 127; }
380 constexpr bool HTML_ISSPACE(sal_uInt32 c)
382 return ' ' == c || '\t' == c || '\r' == c || '\n' == c || '\x0b' == c;
387 HtmlTokenId HTMLParser::ScanText(const sal_Unicode cBreak)
389 OUStringBuffer sTmpBuffer( MAX_LEN );
390 bool bContinue = true;
391 bool bEqSignFound = false;
392 sal_uInt32 cQuote = 0U;
394 while( bContinue && IsParserWorking() )
396 bool bNextCh = true;
397 switch( nNextCh )
399 case '&':
400 bEqSignFound = false;
401 if( bReadXMP )
402 sTmpBuffer.append( '&' );
403 else
405 sal_uInt64 nStreamPos = rInput.Tell();
406 sal_uInt32 nLinePos = GetLinePos();
408 sal_uInt32 cChar = 0U;
409 if( '#' == (nNextCh = GetNextChar()) )
411 nNextCh = GetNextChar();
412 const bool bIsHex( 'x' == nNextCh );
413 const bool bIsDecOrHex( bIsHex || rtl::isAsciiDigit(nNextCh) );
414 if ( bIsDecOrHex )
416 if ( bIsHex )
418 nNextCh = GetNextChar();
419 while ( rtl::isAsciiHexDigit(nNextCh) )
421 cChar = cChar * 16U +
422 ( nNextCh <= '9'
423 ? sal_uInt32( nNextCh - '0' )
424 : ( nNextCh <= 'F'
425 ? sal_uInt32( nNextCh - 'A' + 10 )
426 : sal_uInt32( nNextCh - 'a' + 10 ) ) );
427 nNextCh = GetNextChar();
430 else
434 cChar = cChar * 10U + sal_uInt32( nNextCh - '0');
435 nNextCh = GetNextChar();
437 while( rtl::isAsciiDigit(nNextCh) );
440 if( RTL_TEXTENCODING_DONTKNOW != eSrcEnc &&
441 RTL_TEXTENCODING_UCS2 != eSrcEnc &&
442 RTL_TEXTENCODING_UTF8 != eSrcEnc &&
443 cChar < 256 )
445 const sal_uInt32 convertFlags =
446 RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_DEFAULT |
447 RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_DEFAULT |
448 RTL_TEXTTOUNICODE_FLAGS_INVALID_DEFAULT;
450 char cEncodedChar = static_cast<char>(cChar);
451 cChar = OUString(&cEncodedChar, 1, eSrcEnc, convertFlags).toChar();
452 if( 0U == cChar )
454 // If the character could not be
455 // converted, because a conversion is not
456 // available, do no conversion at all.
457 cChar = cEncodedChar;
461 else
462 nNextCh = 0U;
464 if (!rtl::isUnicodeCodePoint(cChar)
465 || (linguistic::IsControlChar(cChar)
466 && cChar != '\r' && cChar != '\n' && cChar != '\t'))
468 cChar = '?';
471 else if( rtl::isAsciiAlpha( nNextCh ) )
473 OUStringBuffer sEntityBuffer( MAX_ENTITY_LEN );
474 sal_Int32 nPos = 0;
477 sEntityBuffer.appendUtf32( nNextCh );
478 nPos++;
479 nNextCh = GetNextChar();
481 while( nPos < MAX_ENTITY_LEN && rtl::isAsciiAlphanumeric( nNextCh ) &&
482 !rInput.eof() );
484 if( IsParserWorking() && !rInput.eof() )
486 std::u16string_view sEntity(sEntityBuffer.subView(0, nPos));
487 cChar = GetHTMLCharName( sEntity );
489 // not found ( == 0 ): plain text
490 // or a character which is inserted as attribute
491 if( 0U == cChar && ';' != nNextCh )
493 DBG_ASSERT( rInput.Tell() - nStreamPos ==
494 static_cast<sal_uInt64>(nPos+1)*GetCharSize(),
495 "UTF-8 is failing here" );
496 for( sal_Int32 i = nPos-1; i>1; i-- )
498 nNextCh = sEntityBuffer[i];
499 sEntityBuffer.setLength( i );
500 sEntity = sEntityBuffer.subView(0, i);
501 cChar = GetHTMLCharName( sEntity );
502 if( cChar )
504 rInput.SeekRel( -static_cast<sal_Int64>
505 (nPos-i)*GetCharSize() );
506 nlLinePos -= sal_uInt32(nPos-i);
507 nPos = i;
508 ClearTxtConvContext();
509 break;
514 if( !cChar ) // unknown character?
516 // back in stream, insert '&'
517 // and restart with next character
518 sTmpBuffer.append( '&' );
520 DBG_ASSERT( rInput.Tell()-nStreamPos ==
521 static_cast<sal_uInt64>(nPos+1)*GetCharSize(),
522 "Wrong stream position" );
523 DBG_ASSERT( nlLinePos-nLinePos ==
524 static_cast<sal_uInt32>(nPos+1),
525 "Wrong line position" );
526 rInput.Seek( nStreamPos );
527 nlLinePos = nLinePos;
528 ClearTxtConvContext();
529 break;
532 assert(cChar != 0);
534 // 1 == Non Breaking Space
535 // 2 == SoftHyphen
537 if (cChar == 1 || cChar == 2)
539 if( '>' == cBreak )
541 // When reading the content of a tag we have
542 // to change it to ' ' or '-'
543 if( 1U == cChar )
544 cChar = ' ';
545 else //2U
546 cChar = '-';
548 else
550 // If not scanning a tag return token
551 aToken.append( sTmpBuffer );
552 sTmpBuffer.setLength(0);
554 if( !aToken.isEmpty() )
556 // restart with character
557 nNextCh = '&';
558 DBG_ASSERT( rInput.Tell()-nStreamPos ==
559 static_cast<sal_uInt64>(nPos+1)*GetCharSize(),
560 "Wrong stream position" );
561 DBG_ASSERT( nlLinePos-nLinePos ==
562 static_cast<sal_uInt32>(nPos+1),
563 "Wrong line position" );
564 rInput.Seek( nStreamPos );
565 nlLinePos = nLinePos;
566 ClearTxtConvContext();
567 return HtmlTokenId::TEXTTOKEN;
570 // Hack: _GetNextChar shall not read the
571 // next character
572 if( ';' != nNextCh )
573 aToken.append( " " );
574 if( 1U == cChar )
575 return HtmlTokenId::NONBREAKSPACE;
576 else //2U
577 return HtmlTokenId::SOFTHYPH;
581 else
582 nNextCh = 0U;
584 // &{...};-JavaScript-Macros are not supported any longer.
585 else if( IsParserWorking() )
587 sTmpBuffer.append( '&' );
588 bNextCh = false;
589 break;
592 bNextCh = (';' == nNextCh);
593 if( cBreak=='>' && (cChar=='\\' || cChar=='\'' ||
594 cChar=='\"' || cChar==' ') )
596 // ' and " have to be escaped within tags to separate
597 // them from ' and " enclosing options.
598 // \ has to be escaped as well.
599 // Space is protected because it's not a delimiter between
600 // options.
601 sTmpBuffer.append( '\\' );
603 if( IsParserWorking() )
605 if( cChar )
606 sTmpBuffer.appendUtf32( cChar );
608 else if( SvParserState::Pending==eState && '>'!=cBreak )
610 // Restart with '&', the remainder is returned as
611 // text token.
612 if( !aToken.isEmpty() || !sTmpBuffer.isEmpty() )
614 // _GetNextChar() returns the previous text and
615 // during the next execution a new character is read.
616 // Thus we have to position in front of the '&'.
617 nNextCh = 0U;
618 rInput.Seek( nStreamPos - GetCharSize() );
619 nlLinePos = nLinePos-1;
620 ClearTxtConvContext();
621 bReadNextChar = true;
623 bNextCh = false;
626 break;
627 case '=':
628 if( '>'==cBreak && !cQuote )
629 bEqSignFound = true;
630 sTmpBuffer.appendUtf32( nNextCh );
631 break;
633 case '\\':
634 if( '>'==cBreak )
636 // mark within tags
637 sTmpBuffer.append( '\\' );
639 sTmpBuffer.append( '\\' );
640 break;
642 case '\"':
643 case '\'':
644 if( '>'==cBreak )
646 if( bEqSignFound )
647 cQuote = nNextCh;
648 else if( cQuote && (cQuote==nNextCh ) )
649 cQuote = 0U;
651 sTmpBuffer.appendUtf32( nNextCh );
652 bEqSignFound = false;
653 break;
655 case sal_Unicode(EOF):
656 if( rInput.eof() )
658 bContinue = false;
660 // else: ignore, not a valid code point
661 break;
663 case '<':
664 bEqSignFound = false;
665 if( '>'==cBreak )
666 sTmpBuffer.appendUtf32( nNextCh );
667 else
668 bContinue = false; // break, string is together
669 break;
671 case '\f':
672 if( '>' == cBreak )
674 // If scanning options treat it like a space, ...
675 sTmpBuffer.append( ' ' );
677 else
679 // otherwise it's a separate token.
680 bContinue = false;
682 break;
684 case '\r':
685 case '\n':
686 if( '>'==cBreak )
688 // cr/lf in tag is handled in GetNextToken_()
689 sTmpBuffer.appendUtf32( nNextCh );
690 break;
692 else if( bReadListing || bReadXMP || bReadPRE || bReadTextArea )
694 bContinue = false;
695 break;
697 // Reduce sequence of CR/LF/BLANK/TAB to a single blank
698 [[fallthrough]];
699 case '\t':
700 if( '\t'==nNextCh && bReadPRE && '>'!=cBreak )
702 // Pass Tabs up in <PRE>
703 bContinue = false;
704 break;
706 [[fallthrough]];
707 case '\x0b':
708 if( '\x0b'==nNextCh && (bReadPRE || bReadXMP ||bReadListing) &&
709 '>'!=cBreak )
711 break;
713 if (!m_bPreserveSpaces)
714 nNextCh = ' ';
715 [[fallthrough]];
716 case ' ':
717 if (!m_bPreserveSpaces)
719 sTmpBuffer.appendUtf32(nNextCh);
720 if ('>' != cBreak && (!bReadListing && !bReadXMP && !bReadPRE && !bReadTextArea))
722 // Reduce sequences of Blanks/Tabs/CR/LF to a single blank
725 nNextCh = GetNextChar();
726 if (sal_Unicode(EOF) == nNextCh && rInput.eof())
728 if (!aToken.isEmpty() || sTmpBuffer.getLength() > 1)
730 // Have seen s.th. aside from blanks?
731 aToken.append(sTmpBuffer);
732 sTmpBuffer.setLength(0);
733 return HtmlTokenId::TEXTTOKEN;
735 else
736 // Only read blanks: no text must be returned
737 // and GetNextToken_ has to read until EOF
738 return HtmlTokenId::NONE;
740 } while (HTML_ISSPACE(nNextCh));
741 bNextCh = false;
743 break;
745 [[fallthrough]];
746 default:
747 bEqSignFound = false;
748 if (nNextCh == cBreak && !cQuote)
749 bContinue = false;
750 else
752 do {
753 if (!linguistic::IsControlChar(nNextCh) || HTML_ISSPACE(nNextCh))
755 // All remaining characters make their way into the text.
756 sTmpBuffer.appendUtf32( nNextCh );
759 nNextCh = GetNextChar();
760 if( ( sal_Unicode(EOF) == nNextCh && rInput.eof() ) ||
761 !IsParserWorking() )
763 if( !sTmpBuffer.isEmpty() )
764 aToken.append( sTmpBuffer );
765 return HtmlTokenId::TEXTTOKEN;
767 } while( rtl::isAsciiAlpha( nNextCh ) || rtl::isAsciiDigit( nNextCh ) );
768 bNextCh = false;
772 if( bContinue && bNextCh )
773 nNextCh = GetNextChar();
776 if( !sTmpBuffer.isEmpty() )
777 aToken.append( sTmpBuffer );
779 return HtmlTokenId::TEXTTOKEN;
// Reads raw data (script, style or the content up to aEndToken) one line at
// a time into aToken.
//
// Returns HtmlTokenId::RAWDATA for a chunk of raw data. Returns
// HtmlTokenId::NONE when the end token was reached with nothing left to hand
// out, or when the parser is no longer working; in the first case the caller
// goes on with normal token scanning at the '<' of the end token.
HtmlTokenId HTMLParser::GetNextRawToken()
{
    OUStringBuffer sTmpBuffer( MAX_LEN );

    if( bEndTokenFound )
    {
        // During the last execution we already found the end token,
        // thus we don't have to search it again.
        bReadScript = false;
        bReadStyle = false;
        aEndToken.clear();
        bEndTokenFound = false;

        return HtmlTokenId::NONE;
    }

    // Default return value: HtmlTokenId::RAWDATA
    bool bContinue = true;
    HtmlTokenId nToken = HtmlTokenId::RAWDATA;
    SaveState( HtmlTokenId::NONE );
    while( bContinue && IsParserWorking() )
    {
        bool bNextCh = true;
        switch( nNextCh )
        {
        case '<':
            {
                // Maybe we've reached the end.

                // Save what we have read previously...
                aToken.append( sTmpBuffer );
                sTmpBuffer.setLength(0);

                // and remember position in stream.
                sal_uInt64 nStreamPos = rInput.Tell();
                sal_uInt32 nLineNr = GetLineNr();
                sal_uInt32 nLinePos = GetLinePos();

                // Start of an end token?
                bool bOffState = false;
                if( '/' == (nNextCh = GetNextChar()) )
                {
                    bOffState = true;
                    nNextCh = GetNextChar();
                }
                else if( '!' == nNextCh )
                {
                    sTmpBuffer.appendUtf32( nNextCh );
                    nNextCh = GetNextChar();
                }

                // Read following letters
                while( (rtl::isAsciiAlpha(nNextCh) || '-'==nNextCh) &&
                       IsParserWorking() && sTmpBuffer.getLength() < MAX_LEN )
                {
                    sTmpBuffer.appendUtf32( nNextCh );
                    nNextCh = GetNextChar();
                }

                // The tag name is compared case-insensitively.
                OUString aTok( sTmpBuffer.toString() );
                aTok = aTok.toAsciiLowerCase();
                bool bDone = false;
                if( bReadScript || !aEndToken.isEmpty() )
                {
                    // Inside a comment an end token is not recognized.
                    if( !bReadComment )
                    {
                        if( aTok.startsWith( OOO_STRING_SVTOOLS_HTML_comment ) )
                        {
                            bReadComment = true;
                        }
                        else
                        {
                            // A script has to end with "</SCRIPT>". But
                            // ">" is optional for security reasons
                            bDone = bOffState &&
                            ( bReadScript
                                ? aTok == OOO_STRING_SVTOOLS_HTML_script
                                : aTok == aEndToken );
                        }
                    }
                    if( bReadComment && '>'==nNextCh && aTok.endsWith( "--" ) )
                    {
                        // End of comment of style <!----->
                        bReadComment = false;
                    }
                }
                else
                {
                    // Style sheets can be closed by </STYLE>, </HEAD> or <BODY>
                    if( bOffState )
                        bDone = aTok == OOO_STRING_SVTOOLS_HTML_style ||
                                aTok == OOO_STRING_SVTOOLS_HTML_head;
                    else
                        bDone = aTok == OOO_STRING_SVTOOLS_HTML_body;
                }

                if( bDone )
                {
                    // Done! Return the previously read string (if requested)
                    // and continue.

                    bContinue = false;

                    // nToken==0 means, GetNextToken_ continues to read
                    if( aToken.isEmpty() && (bReadStyle || bReadScript) )
                    {
                        // Immediately close environment (or context?)
                        // and parse the end token
                        bReadScript = false;
                        bReadStyle = false;
                        aEndToken.clear();
                        nToken = HtmlTokenId::NONE;
                    }
                    else
                    {
                        // Keep bReadScript/bReadStyle alive
                        // and parse end token during next execution
                        bEndTokenFound = true;
                    }

                    // Move backwards in stream to '<'
                    rInput.Seek( nStreamPos );
                    SetLineNr( nLineNr );
                    SetLinePos( nLinePos );
                    ClearTxtConvContext();
                    nNextCh = '<';

                    // Don't append string to token.
                    sTmpBuffer.setLength( 0 );
                }
                else
                {
                    // remember "</" , everything else we find in the buffer
                    aToken.append( "<" );
                    if( bOffState )
                        aToken.append( "/" );

                    // nNextCh already holds the character after the name.
                    bNextCh = false;
                }
            }
            break;
        case '-':
            sTmpBuffer.appendUtf32( nNextCh );
            if( bReadComment )
            {
                // Inside a comment: two or more '-' directly followed by '>'
                // end the comment.
                bool bTwoMinus = false;
                nNextCh = GetNextChar();
                while( '-' == nNextCh && IsParserWorking() )
                {
                    bTwoMinus = true;
                    sTmpBuffer.appendUtf32( nNextCh );
                    nNextCh = GetNextChar();
                }

                if( '>' == nNextCh && IsParserWorking() && bTwoMinus )
                    bReadComment = false;

                bNextCh = false;
            }
            break;

        case '\r':
            // \r\n? closes the current text token (even if it's empty)
            nNextCh = GetNextChar();
            if( nNextCh=='\n' )
                nNextCh = GetNextChar();
            bContinue = false;
            break;
        case '\n':
            // \n closes the current text token (even if it's empty)
            nNextCh = GetNextChar();
            bContinue = false;
            break;
        case sal_Unicode(EOF):
            // eof closes the current text token and behaves like having read
            // an end token
            if( rInput.eof() )
            {
                bContinue = false;
                if( !aToken.isEmpty() || !sTmpBuffer.isEmpty() )
                {
                    bEndTokenFound = true;
                }
                else
                {
                    bReadScript = false;
                    bReadStyle = false;
                    aEndToken.clear();
                    nToken = HtmlTokenId::NONE;
                }
            }
            break;
        default:
            // Control characters other than TAB are dropped.
            if (!linguistic::IsControlChar(nNextCh) || nNextCh == '\t')
            {
                // all remaining characters are appended to the buffer
                sTmpBuffer.appendUtf32( nNextCh );
            }
            break;
        }

        // When the scan stops, flush what is left in the scratch buffer.
        if( !bContinue && !sTmpBuffer.isEmpty() )
        {
            aToken.append( sTmpBuffer );
            sTmpBuffer.setLength(0);
        }

        if( bContinue && bNextCh )
            nNextCh = GetNextChar();
    }

    if( IsParserWorking() )
        SaveState( HtmlTokenId::NONE );
    else
        nToken = HtmlTokenId::NONE;

    return nToken;
}
1001 // Scan next token
1002 HtmlTokenId HTMLParser::GetNextToken_()
1004 HtmlTokenId nRet = HtmlTokenId::NONE;
1005 sSaveToken.clear();
1007 if (mnPendingOffToken != HtmlTokenId::NONE)
1009 // HtmlTokenId::<TOKEN>_OFF generated for HtmlTokenId::<TOKEN>_ON
1010 nRet = mnPendingOffToken;
1011 mnPendingOffToken = HtmlTokenId::NONE;
1012 aToken.setLength( 0 );
1013 return nRet;
1016 // Delete options
1017 maOptions.clear();
1019 if( !IsParserWorking() ) // Don't continue if already an error occurred
1020 return HtmlTokenId::NONE;
1022 bool bReadNextCharSave = bReadNextChar;
1023 if( bReadNextChar )
1025 DBG_ASSERT( !bEndTokenFound,
1026 "Read a character despite </SCRIPT> was read?" );
1027 nNextCh = GetNextChar();
1028 if( !IsParserWorking() ) // Don't continue if already an error occurred
1029 return HtmlTokenId::NONE;
1030 bReadNextChar = false;
1033 if( bReadScript || bReadStyle || !aEndToken.isEmpty() )
1035 nRet = GetNextRawToken();
1036 if( nRet != HtmlTokenId::NONE || !IsParserWorking() )
1037 return nRet;
1040 do {
1041 bool bNextCh = true;
1042 switch( nNextCh )
1044 case '<':
1046 sal_uInt64 nStreamPos = rInput.Tell();
1047 sal_uInt32 nLineNr = GetLineNr();
1048 sal_uInt32 nLinePos = GetLinePos();
1050 bool bOffState = false;
1051 if( '/' == (nNextCh = GetNextChar()) )
1053 bOffState = true;
1054 nNextCh = GetNextChar();
1056 // Assume '<?' is a start of an XML declaration, ignore it.
1057 if (rtl::isAsciiAlpha(nNextCh) || nNextCh == '!' || nNextCh == '?')
1059 OUStringBuffer sTmpBuffer;
1060 do {
1061 sTmpBuffer.appendUtf32( nNextCh );
1062 nNextCh = GetNextChar();
1063 if (std::u16string_view(sTmpBuffer) == u"![CDATA[")
1064 break;
1065 if (bFuzzing && sTmpBuffer.getLength() > 1024)
1067 SAL_WARN("svtools", "abandoning import for performance reasons with long tokens");
1068 eState = SvParserState::Error;
1069 break;
1071 } while( '>' != nNextCh && '/' != nNextCh && !rtl::isAsciiWhiteSpace( nNextCh ) &&
1072 !linguistic::IsControlChar(nNextCh) &&
1073 IsParserWorking() && !rInput.eof() );
1075 if( !sTmpBuffer.isEmpty() )
1077 aToken.append( sTmpBuffer );
1078 sTmpBuffer.setLength(0);
1081 // Skip blanks
1082 while( rtl::isAsciiWhiteSpace( nNextCh ) && IsParserWorking() )
1083 nNextCh = GetNextChar();
1085 if( !IsParserWorking() )
1087 if( SvParserState::Pending == eState )
1088 bReadNextChar = bReadNextCharSave;
1089 break;
1092 // Search token in table:
1093 sSaveToken = aToken;
1094 aToken = aToken.toString().toAsciiLowerCase();
1096 if (!maNamespace.isEmpty() && o3tl::starts_with(aToken, maNamespace))
1097 aToken.remove( 0, maNamespace.getLength());
1099 if( HtmlTokenId::NONE == (nRet = GetHTMLToken( aToken )) )
1100 // Unknown control
1101 nRet = HtmlTokenId::UNKNOWNCONTROL_ON;
1103 // If it's a token which can be switched off...
1104 if( bOffState )
1106 if( nRet >= HtmlTokenId::ONOFF_START )
1108 // and there is an off token, return off token instead
1109 nRet = static_cast<HtmlTokenId>(static_cast<int>(nRet) + 1);
1111 else if( HtmlTokenId::LINEBREAK!=nRet || !maNamespace.isEmpty())
1113 // and there is no off token, return unknown token.
1114 // (except for </BR>, that is treated like <BR>)
1115 // No exception for XHTML, though.
1116 nRet = HtmlTokenId::UNKNOWNCONTROL_OFF;
1120 if( nRet == HtmlTokenId::COMMENT )
1122 // fix: due to being case sensitive use sSaveToken as start of comment
1123 // and append a blank.
1124 aToken = sSaveToken;
1125 if( '>'!=nNextCh )
1126 aToken.append( " " );
1127 sal_uInt64 nCStreamPos = 0;
1128 sal_uInt32 nCLineNr = 0;
1129 sal_uInt32 nCLinePos = 0;
1130 sal_Int32 nCStrLen = 0;
1132 bool bDone = false;
1133 // Read until closing -->. If not found restart at first >
1134 sTmpBuffer = aToken;
1135 while( !bDone && !rInput.eof() && IsParserWorking() )
1137 if( '>'==nNextCh )
1139 if( !nCStreamPos )
1141 nCStreamPos = rInput.Tell();
1142 nCStrLen = sTmpBuffer.getLength();
1143 nCLineNr = GetLineNr();
1144 nCLinePos = GetLinePos();
1146 bDone = sTmpBuffer.getLength() >= 2 && sTmpBuffer[sTmpBuffer.getLength() - 2] == '-' && sTmpBuffer[sTmpBuffer.getLength() - 1] == '-';
1147 if( !bDone )
1148 sTmpBuffer.appendUtf32(nNextCh);
1150 else if (!linguistic::IsControlChar(nNextCh)
1151 || nNextCh == '\r' || nNextCh == '\n' || nNextCh == '\t')
1153 sTmpBuffer.appendUtf32(nNextCh);
1155 if( !bDone )
1156 nNextCh = GetNextChar();
1158 aToken = sTmpBuffer;
1159 sTmpBuffer.setLength(0);
1160 if( !bDone && IsParserWorking() && nCStreamPos )
1162 rInput.Seek( nCStreamPos );
1163 SetLineNr( nCLineNr );
1164 SetLinePos( nCLinePos );
1165 ClearTxtConvContext();
1166 aToken.truncate(nCStrLen);
1167 nNextCh = '>';
1170 else if (nRet == HtmlTokenId::CDATA)
1172 // Read until the closing ]]>.
1173 bool bDone = false;
1174 while (!bDone && !rInput.eof() && IsParserWorking())
1176 if (nNextCh == '>')
1178 if (sTmpBuffer.getLength() >= 2)
1180 bDone = sTmpBuffer[sTmpBuffer.getLength() - 2] == ']'
1181 && sTmpBuffer[sTmpBuffer.getLength() - 1] == ']';
1182 if (bDone)
1184 // Ignore ]] at the end.
1185 sTmpBuffer.setLength(sTmpBuffer.getLength() - 2);
1188 if (!bDone)
1190 sTmpBuffer.appendUtf32(nNextCh);
1193 else if (!linguistic::IsControlChar(nNextCh))
1195 sTmpBuffer.appendUtf32(nNextCh);
1197 if (!bDone)
1199 nNextCh = GetNextChar();
1202 aToken = sTmpBuffer;
1203 sTmpBuffer.setLength(0);
1205 else
1207 // TokenString not needed anymore
1208 aToken.setLength( 0 );
1211 // Read until closing '>'
1212 if( '>' != nNextCh && IsParserWorking() )
1214 ScanText( '>' );
1216 // fdo#34666 fdo#36080 fdo#36390: closing "/>"?:
1217 // generate pending HtmlTokenId::<TOKEN>_OFF for HtmlTokenId::<TOKEN>_ON
1218 // Do not convert this to a single HtmlTokenId::<TOKEN>_OFF
1219 // which lead to fdo#56772.
1220 if ((nRet >= HtmlTokenId::ONOFF_START) && o3tl::ends_with(aToken, u"/"))
1222 mnPendingOffToken = static_cast<HtmlTokenId>(static_cast<int>(nRet) + 1); // HtmlTokenId::<TOKEN>_ON -> HtmlTokenId::<TOKEN>_OFF
1223 aToken.setLength( aToken.getLength()-1 ); // remove trailing '/'
1225 if( sal_Unicode(EOF) == nNextCh && rInput.eof() )
1227 // Move back in front of < and restart there.
1228 // Return < as text.
1229 rInput.Seek( nStreamPos );
1230 SetLineNr( nLineNr );
1231 SetLinePos( nLinePos );
1232 ClearTxtConvContext();
1234 aToken = "<";
1235 nRet = HtmlTokenId::TEXTTOKEN;
1236 nNextCh = GetNextChar();
1237 bNextCh = false;
1238 break;
1241 if( SvParserState::Pending == eState )
1242 bReadNextChar = bReadNextCharSave;
1244 else
1246 if( bOffState )
1248 // simply throw away everything
1249 ScanText( '>' );
1250 if( sal_Unicode(EOF) == nNextCh && rInput.eof() )
1252 // Move back in front of < and restart there.
1253 // Return < as text.
1254 rInput.Seek( nStreamPos );
1255 SetLineNr( nLineNr );
1256 SetLinePos( nLinePos );
1257 ClearTxtConvContext();
1259 aToken = "<";
1260 nRet = HtmlTokenId::TEXTTOKEN;
1261 nNextCh = GetNextChar();
1262 bNextCh = false;
1263 break;
1265 if( SvParserState::Pending == eState )
1266 bReadNextChar = bReadNextCharSave;
1267 aToken.setLength( 0 );
1269 else if( '%' == nNextCh )
1271 nRet = HtmlTokenId::UNKNOWNCONTROL_ON;
1273 sal_uInt64 nCStreamPos = rInput.Tell();
1274 sal_uInt32 nCLineNr = GetLineNr(), nCLinePos = GetLinePos();
1276 bool bDone = false;
1277 // Read until closing %>. If not found restart at first >.
1278 sal_Unicode nLastTokenChar = !aToken.isEmpty() ? aToken[aToken.getLength() - 1] : 0;
1279 OUStringBuffer aTmpBuffer(aToken);
1280 while( !bDone && !rInput.eof() && IsParserWorking() )
1282 bDone = '>'==nNextCh && nLastTokenChar == '%';
1283 if( !bDone )
1285 aTmpBuffer.appendUtf32(nNextCh);
1286 nLastTokenChar = aTmpBuffer[aTmpBuffer.getLength() - 1];
1287 nNextCh = GetNextChar();
1290 if( !bDone && IsParserWorking() )
1292 rInput.Seek( nCStreamPos );
1293 SetLineNr( nCLineNr );
1294 SetLinePos( nCLinePos );
1295 ClearTxtConvContext();
1296 aToken = "<%";
1297 nRet = HtmlTokenId::TEXTTOKEN;
1298 break;
1300 aToken = aTmpBuffer;
1301 aTmpBuffer.setLength(0);
1302 if( IsParserWorking() )
1304 sSaveToken = aToken;
1305 aToken.setLength( 0 );
1308 else
1310 aToken = "<";
1311 nRet = HtmlTokenId::TEXTTOKEN;
1312 bNextCh = false;
1313 break;
1317 if( IsParserWorking() )
1319 bNextCh = '>' == nNextCh;
1320 switch( nRet )
1322 case HtmlTokenId::TEXTAREA_ON:
1323 bReadTextArea = true;
1324 break;
1325 case HtmlTokenId::TEXTAREA_OFF:
1326 bReadTextArea = false;
1327 break;
1328 case HtmlTokenId::SCRIPT_ON:
1329 if( !bReadTextArea )
1330 bReadScript = true;
1331 break;
1332 case HtmlTokenId::SCRIPT_OFF:
1333 if( !bReadTextArea )
1335 bReadScript = false;
1336 // JavaScript might modify the stream,
1337 // thus the last character has to be read again.
1338 bReadNextChar = true;
1339 bNextCh = false;
1341 break;
1343 case HtmlTokenId::STYLE_ON:
1344 bReadStyle = true;
1345 break;
1346 case HtmlTokenId::STYLE_OFF:
1347 bReadStyle = false;
1348 break;
1349 default: break;
1353 break;
1355 case sal_Unicode(EOF):
1356 if( rInput.eof() )
1358 eState = SvParserState::Accepted;
1359 nRet = HtmlTokenId(nNextCh);
1361 else
1363 // Read normal text.
1364 goto scan_text;
1366 break;
1368 case '\f':
1369 // form feeds are passed upwards separately
1370 nRet = HtmlTokenId::LINEFEEDCHAR; // !!! should be FORMFEEDCHAR
1371 break;
1373 case '\n':
1374 case '\r':
1375 if( bReadListing || bReadXMP || bReadPRE || bReadTextArea )
1377 sal_Unicode c = GetNextChar();
1378 if( ( '\n' != nNextCh || '\r' != c ) &&
1379 ( '\r' != nNextCh || '\n' != c ) )
1381 bNextCh = false;
1382 nNextCh = c;
1384 nRet = HtmlTokenId::NEWPARA;
1385 break;
1387 [[fallthrough]];
1388 case '\t':
1389 if( bReadPRE )
1391 nRet = HtmlTokenId::TABCHAR;
1392 break;
1394 [[fallthrough]];
1395 case ' ':
1396 [[fallthrough]];
1397 default:
1399 scan_text:
1400 // "normal" text to come
1401 nRet = ScanText();
1402 bNextCh = 0 == aToken.getLength();
1404 // the text should be processed
1405 if( !bNextCh && eState == SvParserState::Pending )
1407 eState = SvParserState::Working;
1408 bReadNextChar = true;
1411 break;
1414 if( bNextCh && SvParserState::Working == eState )
1416 nNextCh = GetNextChar();
1417 if( SvParserState::Pending == eState && nRet != HtmlTokenId::NONE && HtmlTokenId::TEXTTOKEN != nRet )
1419 bReadNextChar = true;
1420 eState = SvParserState::Working;
1424 } while( nRet == HtmlTokenId::NONE && SvParserState::Working == eState );
1426 if( SvParserState::Pending == eState )
1427 nRet = HtmlTokenId::INVALID; // s.th. invalid
1429 return nRet;
1432 void HTMLParser::UnescapeToken()
1434 sal_Int32 nPos=0;
1436 bool bEscape = false;
1437 while( nPos < aToken.getLength() )
1439 bool bOldEscape = bEscape;
1440 bEscape = false;
1441 if( '\\'==aToken[nPos] && !bOldEscape )
1443 aToken.remove( nPos, 1 );
1444 bEscape = true;
1446 else
1448 nPos++;
1453 const HTMLOptions& HTMLParser::GetOptions( HtmlOptionId const *pNoConvertToken )
1455 // If the options for the current token have already been returned,
1456 // return them once again.
1457 if (!maOptions.empty())
1458 return maOptions;
1460 sal_Int32 nPos = 0;
1461 while( nPos < aToken.getLength() )
1463 // A letter? Option beginning here.
1464 if( rtl::isAsciiAlpha( aToken[nPos] ) )
1466 HtmlOptionId nToken;
1467 OUString aValue;
1468 sal_Int32 nStt = nPos;
1469 sal_Unicode cChar = 0;
1471 // Actually only certain characters allowed.
1472 // Netscape only looks for "=" and white space (c.f.
1473 // Mozilla: PA_FetchRequestedNameValues in libparse/pa_mdl.c)
1474 while( nPos < aToken.getLength() )
1476 cChar = aToken[nPos];
1477 if ( '=' == cChar ||!HTML_ISPRINTABLE(cChar) || rtl::isAsciiWhiteSpace(cChar) )
1478 break;
1479 nPos++;
1482 OUString sName( aToken.subView( nStt, nPos-nStt ) );
1484 // PlugIns require original token name. Convert to lower case only for searching.
1485 nToken = GetHTMLOption( sName.toAsciiLowerCase() ); // Name is ready
1486 SAL_WARN_IF( nToken==HtmlOptionId::UNKNOWN, "svtools",
1487 "GetOption: unknown HTML option '" << sName << "'" );
1488 bool bStripCRLF = (nToken < HtmlOptionId::SCRIPT_START ||
1489 nToken >= HtmlOptionId::SCRIPT_END) &&
1490 (!pNoConvertToken || nToken != *pNoConvertToken);
1492 while( nPos < aToken.getLength() )
1494 cChar = aToken[nPos];
1495 if ( HTML_ISPRINTABLE(cChar) && !rtl::isAsciiWhiteSpace(cChar) )
1496 break;
1497 nPos++;
1500 // Option with value?
1501 if( nPos!=aToken.getLength() && '='==cChar )
1503 nPos++;
1505 while( nPos < aToken.getLength() )
1507 cChar = aToken[nPos];
1508 if ( HTML_ISPRINTABLE(cChar) && ' ' != cChar && '\t' != cChar && '\r' != cChar && '\n' != cChar )
1509 break;
1510 nPos++;
1513 if( nPos != aToken.getLength() )
1515 sal_Int32 nLen = 0;
1516 nStt = nPos;
1517 if( ('"'==cChar) || '\''==cChar )
1519 sal_Unicode cEnd = cChar;
1520 nPos++; nStt++;
1521 bool bDone = false;
1522 bool bEscape = false;
1523 while( nPos < aToken.getLength() && !bDone )
1525 bool bOldEscape = bEscape;
1526 bEscape = false;
1527 cChar = aToken[nPos];
1528 switch( cChar )
1530 case '\r':
1531 case '\n':
1532 if( bStripCRLF )
1533 aToken.remove( nPos, 1 );
1534 else
1536 nPos++;
1537 nLen++;
1539 break;
1540 case '\\':
1541 if( bOldEscape )
1543 nPos++;
1544 nLen++;
1546 else
1548 aToken.remove( nPos, 1 );
1549 bEscape = true;
1551 break;
1552 case '"':
1553 case '\'':
1554 bDone = !bOldEscape && cChar==cEnd;
1555 if( !bDone )
1557 nPos++;
1558 nLen++;
1560 break;
1561 default:
1562 nPos++;
1563 nLen++;
1564 break;
1567 if( nPos!=aToken.getLength() )
1568 nPos++;
1570 else
1572 // More liberal than the standard: allow all printable characters
1573 bool bEscape = false;
1574 bool bDone = false;
1575 while( nPos < aToken.getLength() && !bDone )
1577 bool bOldEscape = bEscape;
1578 bEscape = false;
1579 sal_Unicode c = aToken[nPos];
1580 switch( c )
1582 case ' ':
1583 bDone = !bOldEscape;
1584 if( !bDone )
1586 nPos++;
1587 nLen++;
1589 break;
1591 case '\t':
1592 case '\r':
1593 case '\n':
1594 bDone = true;
1595 break;
1597 case '\\':
1598 if( bOldEscape )
1600 nPos++;
1601 nLen++;
1603 else
1605 aToken.remove( nPos, 1 );
1606 bEscape = true;
1608 break;
1610 default:
1611 if( HTML_ISPRINTABLE( c ) )
1613 nPos++;
1614 nLen++;
1616 else
1617 bDone = true;
1618 break;
1623 if( nLen )
1624 aValue = aToken.subView( nStt, nLen );
1628 // Token is known and can be saved
1629 maOptions.emplace_back(nToken, sName, aValue);
1632 else
1633 // Ignore white space and unexpected characters
1634 nPos++;
1637 return maOptions;
1640 HtmlTokenId HTMLParser::FilterPRE( HtmlTokenId nToken )
1642 switch( nToken )
1644 // in Netscape they only have impact in not empty paragraphs
1645 case HtmlTokenId::PARABREAK_ON:
1646 nToken = HtmlTokenId::LINEBREAK;
1647 [[fallthrough]];
1648 case HtmlTokenId::LINEBREAK:
1649 case HtmlTokenId::NEWPARA:
1650 nPre_LinePos = 0;
1651 if( bPre_IgnoreNewPara )
1652 nToken = HtmlTokenId::NONE;
1653 break;
1655 case HtmlTokenId::TABCHAR:
1657 sal_Int32 nSpaces = 8 - (nPre_LinePos % 8);
1658 DBG_ASSERT( aToken.isEmpty(), "Why is the token not empty?" );
1659 if (aToken.getLength() < nSpaces)
1661 using comphelper::string::padToLength;
1662 OUStringBuffer aBuf(aToken);
1663 aToken = padToLength(aBuf, nSpaces, ' ');
1665 nPre_LinePos += nSpaces;
1666 nToken = HtmlTokenId::TEXTTOKEN;
1668 break;
1669 // Keep those
1670 case HtmlTokenId::TEXTTOKEN:
1671 nPre_LinePos += aToken.getLength();
1672 break;
1674 case HtmlTokenId::SELECT_ON:
1675 case HtmlTokenId::SELECT_OFF:
1676 case HtmlTokenId::BODY_ON:
1677 case HtmlTokenId::FORM_ON:
1678 case HtmlTokenId::FORM_OFF:
1679 case HtmlTokenId::INPUT:
1680 case HtmlTokenId::OPTION:
1681 case HtmlTokenId::TEXTAREA_ON:
1682 case HtmlTokenId::TEXTAREA_OFF:
1684 case HtmlTokenId::IMAGE:
1685 case HtmlTokenId::APPLET_ON:
1686 case HtmlTokenId::APPLET_OFF:
1687 case HtmlTokenId::PARAM:
1688 case HtmlTokenId::EMBED:
1690 case HtmlTokenId::HEAD1_ON:
1691 case HtmlTokenId::HEAD1_OFF:
1692 case HtmlTokenId::HEAD2_ON:
1693 case HtmlTokenId::HEAD2_OFF:
1694 case HtmlTokenId::HEAD3_ON:
1695 case HtmlTokenId::HEAD3_OFF:
1696 case HtmlTokenId::HEAD4_ON:
1697 case HtmlTokenId::HEAD4_OFF:
1698 case HtmlTokenId::HEAD5_ON:
1699 case HtmlTokenId::HEAD5_OFF:
1700 case HtmlTokenId::HEAD6_ON:
1701 case HtmlTokenId::HEAD6_OFF:
1702 case HtmlTokenId::BLOCKQUOTE_ON:
1703 case HtmlTokenId::BLOCKQUOTE_OFF:
1704 case HtmlTokenId::ADDRESS_ON:
1705 case HtmlTokenId::ADDRESS_OFF:
1706 case HtmlTokenId::HORZRULE:
1708 case HtmlTokenId::CENTER_ON:
1709 case HtmlTokenId::CENTER_OFF:
1710 case HtmlTokenId::DIVISION_ON:
1711 case HtmlTokenId::DIVISION_OFF:
1713 case HtmlTokenId::SCRIPT_ON:
1714 case HtmlTokenId::SCRIPT_OFF:
1715 case HtmlTokenId::RAWDATA:
1717 case HtmlTokenId::TABLE_ON:
1718 case HtmlTokenId::TABLE_OFF:
1719 case HtmlTokenId::CAPTION_ON:
1720 case HtmlTokenId::CAPTION_OFF:
1721 case HtmlTokenId::COLGROUP_ON:
1722 case HtmlTokenId::COLGROUP_OFF:
1723 case HtmlTokenId::COL_ON:
1724 case HtmlTokenId::COL_OFF:
1725 case HtmlTokenId::THEAD_ON:
1726 case HtmlTokenId::THEAD_OFF:
1727 case HtmlTokenId::TFOOT_ON:
1728 case HtmlTokenId::TFOOT_OFF:
1729 case HtmlTokenId::TBODY_ON:
1730 case HtmlTokenId::TBODY_OFF:
1731 case HtmlTokenId::TABLEROW_ON:
1732 case HtmlTokenId::TABLEROW_OFF:
1733 case HtmlTokenId::TABLEDATA_ON:
1734 case HtmlTokenId::TABLEDATA_OFF:
1735 case HtmlTokenId::TABLEHEADER_ON:
1736 case HtmlTokenId::TABLEHEADER_OFF:
1738 case HtmlTokenId::ANCHOR_ON:
1739 case HtmlTokenId::ANCHOR_OFF:
1740 case HtmlTokenId::BOLD_ON:
1741 case HtmlTokenId::BOLD_OFF:
1742 case HtmlTokenId::ITALIC_ON:
1743 case HtmlTokenId::ITALIC_OFF:
1744 case HtmlTokenId::STRIKE_ON:
1745 case HtmlTokenId::STRIKE_OFF:
1746 case HtmlTokenId::STRIKETHROUGH_ON:
1747 case HtmlTokenId::STRIKETHROUGH_OFF:
1748 case HtmlTokenId::UNDERLINE_ON:
1749 case HtmlTokenId::UNDERLINE_OFF:
1750 case HtmlTokenId::BASEFONT_ON:
1751 case HtmlTokenId::BASEFONT_OFF:
1752 case HtmlTokenId::FONT_ON:
1753 case HtmlTokenId::FONT_OFF:
1754 case HtmlTokenId::BLINK_ON:
1755 case HtmlTokenId::BLINK_OFF:
1756 case HtmlTokenId::SPAN_ON:
1757 case HtmlTokenId::SPAN_OFF:
1758 case HtmlTokenId::SUBSCRIPT_ON:
1759 case HtmlTokenId::SUBSCRIPT_OFF:
1760 case HtmlTokenId::SUPERSCRIPT_ON:
1761 case HtmlTokenId::SUPERSCRIPT_OFF:
1762 case HtmlTokenId::BIGPRINT_ON:
1763 case HtmlTokenId::BIGPRINT_OFF:
1764 case HtmlTokenId::SMALLPRINT_OFF:
1765 case HtmlTokenId::SMALLPRINT_ON:
1767 case HtmlTokenId::EMPHASIS_ON:
1768 case HtmlTokenId::EMPHASIS_OFF:
1769 case HtmlTokenId::CITATION_ON:
1770 case HtmlTokenId::CITATION_OFF:
1771 case HtmlTokenId::STRONG_ON:
1772 case HtmlTokenId::STRONG_OFF:
1773 case HtmlTokenId::CODE_ON:
1774 case HtmlTokenId::CODE_OFF:
1775 case HtmlTokenId::SAMPLE_ON:
1776 case HtmlTokenId::SAMPLE_OFF:
1777 case HtmlTokenId::KEYBOARD_ON:
1778 case HtmlTokenId::KEYBOARD_OFF:
1779 case HtmlTokenId::VARIABLE_ON:
1780 case HtmlTokenId::VARIABLE_OFF:
1781 case HtmlTokenId::DEFINSTANCE_ON:
1782 case HtmlTokenId::DEFINSTANCE_OFF:
1783 case HtmlTokenId::SHORTQUOTE_ON:
1784 case HtmlTokenId::SHORTQUOTE_OFF:
1785 case HtmlTokenId::LANGUAGE_ON:
1786 case HtmlTokenId::LANGUAGE_OFF:
1787 case HtmlTokenId::AUTHOR_ON:
1788 case HtmlTokenId::AUTHOR_OFF:
1789 case HtmlTokenId::PERSON_ON:
1790 case HtmlTokenId::PERSON_OFF:
1791 case HtmlTokenId::ACRONYM_ON:
1792 case HtmlTokenId::ACRONYM_OFF:
1793 case HtmlTokenId::ABBREVIATION_ON:
1794 case HtmlTokenId::ABBREVIATION_OFF:
1795 case HtmlTokenId::INSERTEDTEXT_ON:
1796 case HtmlTokenId::INSERTEDTEXT_OFF:
1797 case HtmlTokenId::DELETEDTEXT_ON:
1798 case HtmlTokenId::DELETEDTEXT_OFF:
1799 case HtmlTokenId::TELETYPE_ON:
1800 case HtmlTokenId::TELETYPE_OFF:
1802 break;
1804 // The remainder is treated as an unknown token.
1805 default:
1806 if( nToken != HtmlTokenId::NONE )
1808 nToken =
1809 ( ((nToken >= HtmlTokenId::ONOFF_START) && isOffToken(nToken))
1810 ? HtmlTokenId::UNKNOWNCONTROL_OFF
1811 : HtmlTokenId::UNKNOWNCONTROL_ON );
1813 break;
1816 bPre_IgnoreNewPara = false;
1818 return nToken;
1821 HtmlTokenId HTMLParser::FilterXMP( HtmlTokenId nToken )
1823 switch( nToken )
1825 case HtmlTokenId::NEWPARA:
1826 if( bPre_IgnoreNewPara )
1827 nToken = HtmlTokenId::NONE;
1828 [[fallthrough]];
1829 case HtmlTokenId::TEXTTOKEN:
1830 case HtmlTokenId::NONBREAKSPACE:
1831 case HtmlTokenId::SOFTHYPH:
1832 break; // kept
1834 default:
1835 if( nToken != HtmlTokenId::NONE )
1837 if( (nToken >= HtmlTokenId::ONOFF_START) && isOffToken(nToken) )
1839 sSaveToken = "</" + sSaveToken;
1841 else
1842 sSaveToken = "<" + sSaveToken;
1843 if( !aToken.isEmpty() )
1845 UnescapeToken();
1846 sSaveToken += " ";
1847 aToken.insert(0, sSaveToken);
1849 else
1850 aToken = sSaveToken;
1851 aToken.append( ">" );
1852 nToken = HtmlTokenId::TEXTTOKEN;
1854 break;
1857 bPre_IgnoreNewPara = false;
1859 return nToken;
1862 HtmlTokenId HTMLParser::FilterListing( HtmlTokenId nToken )
1864 switch( nToken )
1866 case HtmlTokenId::NEWPARA:
1867 if( bPre_IgnoreNewPara )
1868 nToken = HtmlTokenId::NONE;
1869 [[fallthrough]];
1870 case HtmlTokenId::TEXTTOKEN:
1871 case HtmlTokenId::NONBREAKSPACE:
1872 case HtmlTokenId::SOFTHYPH:
1873 break; // kept
1875 default:
1876 if( nToken != HtmlTokenId::NONE )
1878 nToken =
1879 ( ((nToken >= HtmlTokenId::ONOFF_START) && isOffToken(nToken))
1880 ? HtmlTokenId::UNKNOWNCONTROL_OFF
1881 : HtmlTokenId::UNKNOWNCONTROL_ON );
1883 break;
1886 bPre_IgnoreNewPara = false;
1888 return nToken;
1891 bool HTMLParser::InternalImgToPrivateURL( OUString& rURL )
1893 bool bFound = false;
1895 if( rURL.startsWith( OOO_STRING_SVTOOLS_HTML_internal_icon ) )
1897 OUString aName( rURL.copy(14) );
1898 switch( aName[0] )
1900 case 'b':
1901 bFound = aName == OOO_STRING_SVTOOLS_HTML_INT_ICON_baddata;
1902 break;
1903 case 'd':
1904 bFound = aName == OOO_STRING_SVTOOLS_HTML_INT_ICON_delayed;
1905 break;
1906 case 'e':
1907 bFound = aName == OOO_STRING_SVTOOLS_HTML_INT_ICON_embed;
1908 break;
1909 case 'i':
1910 bFound = aName == OOO_STRING_SVTOOLS_HTML_INT_ICON_insecure;
1911 break;
1912 case 'n':
1913 bFound = aName == OOO_STRING_SVTOOLS_HTML_INT_ICON_notfound;
1914 break;
1917 if( bFound )
1919 OUString sTmp ( rURL );
1920 rURL = OOO_STRING_SVTOOLS_HTML_private_image;
1921 rURL += sTmp;
1924 return bFound;
namespace {

// Kinds of <META> entries distinguished by HTMLParser::ParseMetaOptionsImpl.
// The value is looked up from the NAME / HTTP-EQUIV option via
// aHTMLMetaNameTable; NONE is the initial value used when no lookup has
// set an action.
enum class HtmlMeta {
    NONE = 0,
    Author,
    Description,
    Keywords,
    Refresh,
    Classification,
    Created,
    ChangedBy,
    Changed,
    Generator,
    SDFootnote,
    SDEndnote,
    ContentType
};

}
1947 // <META NAME=xxx>
1948 HTMLOptionEnum<HtmlMeta> const aHTMLMetaNameTable[] =
1950 { OOO_STRING_SVTOOLS_HTML_META_author, HtmlMeta::Author },
1951 { OOO_STRING_SVTOOLS_HTML_META_changed, HtmlMeta::Changed },
1952 { OOO_STRING_SVTOOLS_HTML_META_changedby, HtmlMeta::ChangedBy },
1953 { OOO_STRING_SVTOOLS_HTML_META_classification,HtmlMeta::Classification},
1954 { OOO_STRING_SVTOOLS_HTML_META_content_type, HtmlMeta::ContentType },
1955 { OOO_STRING_SVTOOLS_HTML_META_created, HtmlMeta::Created },
1956 { OOO_STRING_SVTOOLS_HTML_META_description, HtmlMeta::Description },
1957 { OOO_STRING_SVTOOLS_HTML_META_keywords, HtmlMeta::Keywords },
1958 { OOO_STRING_SVTOOLS_HTML_META_generator, HtmlMeta::Generator },
1959 { OOO_STRING_SVTOOLS_HTML_META_refresh, HtmlMeta::Refresh },
1960 { OOO_STRING_SVTOOLS_HTML_META_sdendnote, HtmlMeta::SDEndnote },
1961 { OOO_STRING_SVTOOLS_HTML_META_sdfootnote, HtmlMeta::SDFootnote },
1962 { nullptr, HtmlMeta(0) }
1966 void HTMLParser::AddMetaUserDefined( OUString const & )
1970 bool HTMLParser::ParseMetaOptionsImpl(
1971 const uno::Reference<document::XDocumentProperties> & i_xDocProps,
1972 SvKeyValueIterator *i_pHTTPHeader,
1973 const HTMLOptions& aOptions,
1974 rtl_TextEncoding& o_rEnc )
1976 OUString aName, aContent;
1977 HtmlMeta nAction = HtmlMeta::NONE;
1978 bool bHTTPEquiv = false, bChanged = false;
1980 for ( size_t i = aOptions.size(); i; )
1982 const HTMLOption& aOption = aOptions[--i];
1983 switch ( aOption.GetToken() )
1985 case HtmlOptionId::NAME:
1986 aName = aOption.GetString();
1987 if ( HtmlMeta::NONE==nAction )
1989 aOption.GetEnum( nAction, aHTMLMetaNameTable );
1991 break;
1992 case HtmlOptionId::HTTPEQUIV:
1993 aName = aOption.GetString();
1994 aOption.GetEnum( nAction, aHTMLMetaNameTable );
1995 bHTTPEquiv = true;
1996 break;
1997 case HtmlOptionId::CONTENT:
1998 aContent = aOption.GetString();
1999 break;
2000 case HtmlOptionId::CHARSET:
2002 OString sValue(OUStringToOString(aOption.GetString(), RTL_TEXTENCODING_ASCII_US));
2003 o_rEnc = GetExtendedCompatibilityTextEncoding(rtl_getTextEncodingFromMimeCharset(sValue.getStr()));
2004 break;
2006 default: break;
2010 if ( bHTTPEquiv || HtmlMeta::Description != nAction )
2012 // if it is not a Description, remove CRs and LFs from CONTENT
2013 aContent = aContent.replaceAll("\r", "").replaceAll("\n", "");
2015 else
2017 // convert line endings for Description
2018 aContent = convertLineEnd(aContent, GetSystemLineEnd());
2021 if ( bHTTPEquiv && i_pHTTPHeader )
2023 // Netscape seems to just ignore a closing ", so we do too
2024 if ( aContent.endsWith("\"") )
2026 aContent = aContent.copy( 0, aContent.getLength() - 1 );
2028 SvKeyValue aKeyValue( aName, aContent );
2029 i_pHTTPHeader->Append( aKeyValue );
2032 switch ( nAction )
2034 case HtmlMeta::Author:
2035 if (i_xDocProps.is()) {
2036 i_xDocProps->setAuthor( aContent );
2037 bChanged = true;
2039 break;
2040 case HtmlMeta::Description:
2041 if (i_xDocProps.is()) {
2042 i_xDocProps->setDescription( aContent );
2043 bChanged = true;
2045 break;
2046 case HtmlMeta::Keywords:
2047 if (i_xDocProps.is()) {
2048 i_xDocProps->setKeywords(
2049 ::comphelper::string::convertCommaSeparated(aContent));
2050 bChanged = true;
2052 break;
2053 case HtmlMeta::Classification:
2054 if (i_xDocProps.is()) {
2055 i_xDocProps->setSubject( aContent );
2056 bChanged = true;
2058 break;
2060 case HtmlMeta::ChangedBy:
2061 if (i_xDocProps.is()) {
2062 i_xDocProps->setModifiedBy( aContent );
2063 bChanged = true;
2065 break;
2067 case HtmlMeta::Created:
2068 case HtmlMeta::Changed:
2069 if (i_xDocProps.is() && !aContent.isEmpty())
2071 ::util::DateTime uDT;
2072 bool valid = false;
2073 if (comphelper::string::getTokenCount(aContent, ';') == 2)
2075 sal_Int32 nIdx{ 0 };
2076 sal_Int32 nDate = o3tl::toInt32(o3tl::getToken(aContent, 0, ';', nIdx));
2077 sal_Int64 nTime = o3tl::toInt64(o3tl::getToken(aContent, 0, ';', nIdx));
2078 valid = nDate != std::numeric_limits<sal_Int32>::min() &&
2079 nTime != std::numeric_limits<sal_Int64>::min();
2080 if (valid)
2082 Date aDate(nDate);
2083 tools::Time aTime(tools::Time::fromEncodedTime(nTime));
2084 uDT = DateTime(aDate, aTime).GetUNODateTime();
2087 else if (utl::ISO8601parseDateTime(aContent, uDT))
2088 valid = true;
2090 if (valid)
2092 bChanged = true;
2093 if (HtmlMeta::Created == nAction)
2094 i_xDocProps->setCreationDate(uDT);
2095 else
2096 i_xDocProps->setModificationDate(uDT);
2099 break;
2101 case HtmlMeta::Refresh:
2102 DBG_ASSERT( !bHTTPEquiv || i_pHTTPHeader, "Lost Reload-URL because of omitted MUST change." );
2103 break;
2105 case HtmlMeta::ContentType:
2106 if ( !aContent.isEmpty() )
2108 o_rEnc = GetEncodingByMIME( aContent );
2110 break;
2112 case HtmlMeta::NONE:
2113 if ( !bHTTPEquiv )
2115 if (i_xDocProps.is())
2117 uno::Reference<beans::XPropertyContainer> xUDProps
2118 = i_xDocProps->getUserDefinedProperties();
2119 try {
2120 xUDProps->addProperty(aName,
2121 beans::PropertyAttribute::REMOVABLE,
2122 uno::Any(aContent));
2123 AddMetaUserDefined(aName);
2124 bChanged = true;
2125 } catch (uno::Exception &) {
2126 // ignore
2130 break;
2131 default:
2132 break;
2135 return bChanged;
2138 bool HTMLParser::ParseMetaOptions(
2139 const uno::Reference<document::XDocumentProperties> & i_xDocProps,
2140 SvKeyValueIterator *i_pHeader )
2142 HtmlOptionId nContentOption = HtmlOptionId::CONTENT;
2143 rtl_TextEncoding eEnc = RTL_TEXTENCODING_DONTKNOW;
2145 bool bRet = ParseMetaOptionsImpl( i_xDocProps, i_pHeader,
2146 GetOptions(&nContentOption),
2147 eEnc );
2149 // If the encoding is set by a META tag, it may only overwrite the
2150 // current encoding if both, the current and the new encoding, are 1-sal_uInt8
2151 // encodings. Everything else cannot lead to reasonable results.
2152 if (RTL_TEXTENCODING_DONTKNOW != eEnc &&
2153 rtl_isOctetTextEncoding( eEnc ) &&
2154 rtl_isOctetTextEncoding( GetSrcEncoding() ) )
2156 eEnc = GetExtendedCompatibilityTextEncoding( eEnc );
2157 SetSrcEncoding( eEnc );
2160 return bRet;
2163 rtl_TextEncoding HTMLParser::GetEncodingByMIME( const OUString& rMime )
2165 OUString sType;
2166 OUString sSubType;
2167 INetContentTypeParameterList aParameters;
2168 if (INetContentTypes::parse(rMime, sType, sSubType, &aParameters))
2170 auto const iter = aParameters.find("charset"_ostr);
2171 if (iter != aParameters.end())
2173 const INetContentTypeParameter * pCharset = &iter->second;
2174 OString sValue(OUStringToOString(pCharset->m_sValue, RTL_TEXTENCODING_ASCII_US));
2175 return GetExtendedCompatibilityTextEncoding( rtl_getTextEncodingFromMimeCharset( sValue.getStr() ) );
2178 return RTL_TEXTENCODING_DONTKNOW;
2181 rtl_TextEncoding HTMLParser::GetEncodingByHttpHeader( SvKeyValueIterator *pHTTPHeader )
2183 rtl_TextEncoding eRet = RTL_TEXTENCODING_DONTKNOW;
2184 if( pHTTPHeader )
2186 SvKeyValue aKV;
2187 for( bool bCont = pHTTPHeader->GetFirst( aKV ); bCont;
2188 bCont = pHTTPHeader->GetNext( aKV ) )
2190 if( aKV.GetKey().equalsIgnoreAsciiCase( OOO_STRING_SVTOOLS_HTML_META_content_type ) )
2192 if( !aKV.GetValue().isEmpty() )
2194 eRet = HTMLParser::GetEncodingByMIME( aKV.GetValue() );
2199 return eRet;
2202 bool HTMLParser::SetEncodingByHTTPHeader( SvKeyValueIterator *pHTTPHeader )
2204 bool bRet = false;
2205 rtl_TextEncoding eEnc = HTMLParser::GetEncodingByHttpHeader( pHTTPHeader );
2206 if(RTL_TEXTENCODING_DONTKNOW != eEnc)
2208 SetSrcEncoding( eEnc );
2209 bRet = true;
2211 return bRet;
2215 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */