android: Update app-specific/MIME type icons
[LibreOffice.git] / svtools / source / svhtml / parhtml.cxx
blob7e8ac63fc61eee79095dba4c325eeb4010244d26
1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 * This file incorporates work covered by the following license notice:
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
20 #include <comphelper/string.hxx>
21 #include <o3tl/safeint.hxx>
22 #include <o3tl/string_view.hxx>
23 #include <tools/stream.hxx>
24 #include <tools/debug.hxx>
25 #include <tools/color.hxx>
26 #include <rtl/ustrbuf.hxx>
27 #include <rtl/character.hxx>
28 #include <rtl/tencinfo.h>
29 #include <sal/log.hxx>
30 #include <tools/tenccvt.hxx>
31 #include <tools/datetime.hxx>
32 #include <unotools/datetime.hxx>
33 #include <svl/inettype.hxx>
34 #include <svl/lngmisc.hxx>
35 #include <com/sun/star/beans/PropertyAttribute.hpp>
36 #include <com/sun/star/document/XDocumentProperties.hpp>
38 #include <svtools/parhtml.hxx>
39 #include <svtools/htmltokn.h>
40 #include <svtools/htmlkywd.hxx>
42 #include <utility>
44 using namespace ::com::sun::star;
47 const sal_Int32 MAX_LEN( 1024 );
49 const sal_Int32 MAX_ENTITY_LEN( 8 );
52 // Tables to convert option values into strings
// Each table pairs a keyword string with an enum value and ends with an
// entry whose string is nullptr (presumably the end marker for GetEnum()).
54 // <INPUT TYPE=xxx>
// Looked up by HTMLOption::GetInputType().
55 HTMLOptionEnum<HTMLInputType> const aInputTypeOptEnums[] =
57 { OOO_STRING_SVTOOLS_HTML_IT_text, HTMLInputType::Text },
58 { OOO_STRING_SVTOOLS_HTML_IT_password, HTMLInputType::Password },
59 { OOO_STRING_SVTOOLS_HTML_IT_checkbox, HTMLInputType::Checkbox },
60 { OOO_STRING_SVTOOLS_HTML_IT_radio, HTMLInputType::Radio },
61 { OOO_STRING_SVTOOLS_HTML_IT_range, HTMLInputType::Range },
62 { OOO_STRING_SVTOOLS_HTML_IT_scribble, HTMLInputType::Scribble },
63 { OOO_STRING_SVTOOLS_HTML_IT_file, HTMLInputType::File },
64 { OOO_STRING_SVTOOLS_HTML_IT_hidden, HTMLInputType::Hidden },
65 { OOO_STRING_SVTOOLS_HTML_IT_submit, HTMLInputType::Submit },
66 { OOO_STRING_SVTOOLS_HTML_IT_image, HTMLInputType::Image },
67 { OOO_STRING_SVTOOLS_HTML_IT_reset, HTMLInputType::Reset },
68 { OOO_STRING_SVTOOLS_HTML_IT_button, HTMLInputType::Button },
// Last entry: null keyword string.
69 { nullptr, HTMLInputType(0) }
72 // <TABLE FRAME=xxx>
// Looked up by HTMLOption::GetTableFrame(). Note that both the "box" and
// the "border" keyword entries map to HTMLTableFrame::Box.
73 HTMLOptionEnum<HTMLTableFrame> const aTableFrameOptEnums[] =
75 { OOO_STRING_SVTOOLS_HTML_TF_void, HTMLTableFrame::Void },
76 { OOO_STRING_SVTOOLS_HTML_TF_above, HTMLTableFrame::Above },
77 { OOO_STRING_SVTOOLS_HTML_TF_below, HTMLTableFrame::Below },
78 { OOO_STRING_SVTOOLS_HTML_TF_hsides, HTMLTableFrame::HSides },
79 { OOO_STRING_SVTOOLS_HTML_TF_lhs, HTMLTableFrame::LHS },
80 { OOO_STRING_SVTOOLS_HTML_TF_rhs, HTMLTableFrame::RHS },
81 { OOO_STRING_SVTOOLS_HTML_TF_vsides, HTMLTableFrame::VSides },
82 { OOO_STRING_SVTOOLS_HTML_TF_box, HTMLTableFrame::Box },
83 { OOO_STRING_SVTOOLS_HTML_TF_border, HTMLTableFrame::Box },
// Last entry: null keyword string.
84 { nullptr, HTMLTableFrame(0) }
87 // <TABLE RULES=xxx>
// Looked up by HTMLOption::GetTableRules().
88 HTMLOptionEnum<HTMLTableRules> const aTableRulesOptEnums[] =
90 { OOO_STRING_SVTOOLS_HTML_TR_none, HTMLTableRules::NONE },
91 { OOO_STRING_SVTOOLS_HTML_TR_groups, HTMLTableRules::Groups },
92 { OOO_STRING_SVTOOLS_HTML_TR_rows, HTMLTableRules::Rows },
93 { OOO_STRING_SVTOOLS_HTML_TR_cols, HTMLTableRules::Cols },
94 { OOO_STRING_SVTOOLS_HTML_TR_all, HTMLTableRules::All },
// Last entry: null keyword string.
95 { nullptr, HTMLTableRules(0) }
99 HTMLOption::HTMLOption( HtmlOptionId nTok, OUString _aToken,
100 OUString _aValue )
101 : aValue(std::move(_aValue))
102 , aToken(std::move(_aToken))
103 , nToken( nTok )
105 DBG_ASSERT( nToken>=HtmlOptionId::BOOL_START && nToken<HtmlOptionId::END,
106 "HTMLOption: unknown token" );
109 sal_uInt32 HTMLOption::GetNumber() const
111 DBG_ASSERT( (nToken>=HtmlOptionId::NUMBER_START &&
112 nToken<HtmlOptionId::NUMBER_END) ||
113 (nToken>=HtmlOptionId::CONTEXT_START &&
114 nToken<HtmlOptionId::CONTEXT_END) ||
115 nToken==HtmlOptionId::VALUE,
116 "GetNumber: Option not numerical" );
117 OUString aTmp(comphelper::string::stripStart(aValue, ' '));
118 sal_Int32 nTmp = aTmp.toInt32();
119 return nTmp >= 0 ? static_cast<sal_uInt32>(nTmp) : 0;
122 sal_Int32 HTMLOption::GetSNumber() const
124 DBG_ASSERT( (nToken>=HtmlOptionId::NUMBER_START && nToken<HtmlOptionId::NUMBER_END) ||
125 (nToken>=HtmlOptionId::CONTEXT_START && nToken<HtmlOptionId::CONTEXT_END),
126 "GetSNumber: Option not numerical" );
127 OUString aTmp(comphelper::string::stripStart(aValue, ' '));
128 return aTmp.toInt32();
131 void HTMLOption::GetNumbers( std::vector<sal_uInt32> &rNumbers ) const
133 rNumbers.clear();
135 // This is a very simplified scanner: it only searches all
136 // numerals in the string.
137 bool bInNum = false;
138 sal_uInt32 nNum = 0;
139 for( sal_Int32 i=0; i<aValue.getLength(); i++ )
141 sal_Unicode c = aValue[ i ];
142 if( c>='0' && c<='9' )
144 nNum *= 10;
145 nNum += (c - '0');
146 bInNum = true;
148 else if( bInNum )
150 rNumbers.push_back( nNum );
151 bInNum = false;
152 nNum = 0;
155 if( bInNum )
157 rNumbers.push_back( nNum );
161 void HTMLOption::GetColor( Color& rColor ) const
163 DBG_ASSERT( (nToken>=HtmlOptionId::COLOR_START && nToken<HtmlOptionId::COLOR_END) || nToken==HtmlOptionId::SIZE,
164 "GetColor: Option is not a color." );
166 OUString aTmp(aValue.toAsciiLowerCase());
167 sal_uInt32 nColor = SAL_MAX_UINT32;
168 if (!aTmp.isEmpty() && aTmp[0] != '#')
169 nColor = GetHTMLColor(aTmp);
171 if( SAL_MAX_UINT32 == nColor )
173 nColor = 0;
174 sal_Int32 nPos = 0;
175 for (sal_uInt32 i=0; i<6; ++i)
177 // Whatever Netscape does to get color values,
178 // at maximum three characters < '0' are ignored.
179 sal_Unicode c = nPos<aTmp.getLength() ? aTmp[ nPos++ ] : '0';
180 if( c < '0' )
182 c = nPos<aTmp.getLength() ? aTmp[nPos++] : '0';
183 if( c < '0' )
184 c = nPos<aTmp.getLength() ? aTmp[nPos++] : '0';
186 nColor *= 16;
187 if( c >= '0' && c <= '9' )
188 nColor += (c - '0');
189 else if( c >= 'a' && c <= 'f' )
190 nColor += (c + 0xa - 'a');
194 rColor.SetRed( static_cast<sal_uInt8>((nColor & 0x00ff0000) >> 16) );
195 rColor.SetGreen( static_cast<sal_uInt8>((nColor & 0x0000ff00) >> 8));
196 rColor.SetBlue( static_cast<sal_uInt8>(nColor & 0x000000ff) );
199 HTMLInputType HTMLOption::GetInputType() const
201 DBG_ASSERT( nToken==HtmlOptionId::TYPE, "GetInputType: Option not TYPE" );
202 return GetEnum( aInputTypeOptEnums, HTMLInputType::Text );
205 HTMLTableFrame HTMLOption::GetTableFrame() const
207 DBG_ASSERT( nToken==HtmlOptionId::FRAME, "GetTableFrame: Option not FRAME" );
208 return GetEnum( aTableFrameOptEnums );
211 HTMLTableRules HTMLOption::GetTableRules() const
213 DBG_ASSERT( nToken==HtmlOptionId::RULES, "GetTableRules: Option not RULES" );
214 return GetEnum( aTableRulesOptEnums );
217 HTMLParser::HTMLParser( SvStream& rIn, bool bReadNewDoc ) :
218 SvParser<HtmlTokenId>( rIn ),
219 bNewDoc(bReadNewDoc),
220 bIsInHeader(true),
221 bReadListing(false),
222 bReadXMP(false),
223 bReadPRE(false),
224 bReadTextArea(false),
225 bReadScript(false),
226 bReadStyle(false),
227 bEndTokenFound(false),
228 bPre_IgnoreNewPara(false),
229 bReadNextChar(false),
230 bReadComment(false),
231 nPre_LinePos(0),
232 mnPendingOffToken(HtmlTokenId::NONE)
234 //#i76649, default to UTF-8 for HTML unless we know differently
235 SetSrcEncoding(RTL_TEXTENCODING_UTF8);
238 HTMLParser::~HTMLParser()
242 void HTMLParser::SetNamespace(std::u16string_view rNamespace)
244 // Convert namespace alias to a prefix.
245 maNamespace = OUString::Concat(rNamespace) + ":";
248 namespace
250 class RefGuard
252 private:
253 HTMLParser& m_rParser;
254 public:
255 RefGuard(HTMLParser& rParser)
256 : m_rParser(rParser)
258 m_rParser.AddFirstRef();
261 ~RefGuard()
263 if (m_rParser.GetStatus() != SvParserState::Pending)
264 m_rParser.ReleaseRef(); // Parser not needed anymore
269 SvParserState HTMLParser::CallParser()
271 eState = SvParserState::Working;
272 nNextCh = GetNextChar();
273 SaveState( HtmlTokenId::NONE );
275 nPre_LinePos = 0;
276 bPre_IgnoreNewPara = false;
278 RefGuard aRefGuard(*this);
280 Continue( HtmlTokenId::NONE );
282 return eState;
285 void HTMLParser::Continue( HtmlTokenId nToken )
287 if( nToken == HtmlTokenId::NONE )
288 nToken = GetNextToken();
290 while( IsParserWorking() )
292 SaveState( nToken );
293 nToken = FilterToken( nToken );
295 if( nToken != HtmlTokenId::NONE )
296 NextToken( nToken );
298 if( IsParserWorking() )
299 SaveState( HtmlTokenId::NONE ); // continue with new token
301 nToken = GetNextToken();
305 HtmlTokenId HTMLParser::FilterToken( HtmlTokenId nToken )
307 switch( nToken )
309 case HtmlTokenId(EOF):
310 nToken = HtmlTokenId::NONE;
311 break; // don't pass
313 case HtmlTokenId::HEAD_OFF:
314 bIsInHeader = false;
315 break;
317 case HtmlTokenId::HEAD_ON:
318 bIsInHeader = true;
319 break;
321 case HtmlTokenId::BODY_ON:
322 bIsInHeader = false;
323 break;
325 case HtmlTokenId::FRAMESET_ON:
326 bIsInHeader = false;
327 break;
329 case HtmlTokenId::BODY_OFF:
330 bReadPRE = bReadListing = bReadXMP = false;
331 break;
333 case HtmlTokenId::HTML_OFF:
334 nToken = HtmlTokenId::NONE;
335 bReadPRE = bReadListing = bReadXMP = false;
336 break; // HtmlTokenId::ON hasn't been passed either !
338 case HtmlTokenId::PREFORMTXT_ON:
339 StartPRE();
340 break;
342 case HtmlTokenId::PREFORMTXT_OFF:
343 FinishPRE();
344 break;
346 case HtmlTokenId::LISTING_ON:
347 StartListing();
348 break;
350 case HtmlTokenId::LISTING_OFF:
351 FinishListing();
352 break;
354 case HtmlTokenId::XMP_ON:
355 StartXMP();
356 break;
358 case HtmlTokenId::XMP_OFF:
359 FinishXMP();
360 break;
362 default:
363 if( bReadPRE )
364 nToken = FilterPRE( nToken );
365 else if( bReadListing )
366 nToken = FilterListing( nToken );
367 else if( bReadXMP )
368 nToken = FilterXMP( nToken );
370 break;
373 return nToken;
376 namespace {
378 constexpr bool HTML_ISPRINTABLE(sal_Unicode c) { return c >= 32 && c != 127; }
// Scans text starting at the current character nNextCh and appends what
// was collected to aToken.
//
// cBreak is the character that ends the scan when it is met outside of a
// quoted section; '>' is used when the content of a tag is scanned, and
// several branches below behave differently in that case.
//
// Returns HtmlTokenId::TEXTTOKEN in the normal case. Outside of tags, a
// non-breaking space or soft hyphen entity yields
// HtmlTokenId::NONBREAKSPACE / HtmlTokenId::SOFTHYPH, and a run consisting
// only of blanks up to EOF yields HtmlTokenId::NONE.
382 HtmlTokenId HTMLParser::ScanText( const sal_Unicode cBreak )
384 OUStringBuffer sTmpBuffer( MAX_LEN );
385 bool bContinue = true;
386 bool bEqSignFound = false;
// Currently open quote character inside a tag, 0 if none.
387 sal_uInt32 cQuote = 0U;
389 while( bContinue && IsParserWorking() )
// bNextCh: read a new character at the end of this iteration.
391 bool bNextCh = true;
392 switch( nNextCh )
// Character reference: numeric (&#ddd; or &#xhh;) or named entity.
// Inside <XMP> the '&' is taken literally.
394 case '&':
395 bEqSignFound = false;
396 if( bReadXMP )
397 sTmpBuffer.append( '&' );
398 else
// Remember where we are so that we can seek back if the reference
// turns out to be unknown.
400 sal_uInt64 nStreamPos = rInput.Tell();
401 sal_uInt32 nLinePos = GetLinePos();
403 sal_uInt32 cChar = 0U;
404 if( '#' == (nNextCh = GetNextChar()) )
406 nNextCh = GetNextChar();
// NOTE(review): only a lowercase 'x' is accepted as hex marker here; an
// uppercase 'X' appears to end up in the non-numeric branch — confirm
// whether that is intended.
407 const bool bIsHex( 'x' == nNextCh );
408 const bool bIsDecOrHex( bIsHex || rtl::isAsciiDigit(nNextCh) );
409 if ( bIsDecOrHex )
411 if ( bIsHex )
413 nNextCh = GetNextChar();
414 while ( rtl::isAsciiHexDigit(nNextCh) )
416 cChar = cChar * 16U +
417 ( nNextCh <= '9'
418 ? sal_uInt32( nNextCh - '0' )
419 : ( nNextCh <= 'F'
420 ? sal_uInt32( nNextCh - 'A' + 10 )
421 : sal_uInt32( nNextCh - 'a' + 10 ) ) );
422 nNextCh = GetNextChar();
425 else
429 cChar = cChar * 10U + sal_uInt32( nNextCh - '0');
430 nNextCh = GetNextChar();
432 while( rtl::isAsciiDigit(nNextCh) );
// For source encodings other than DONTKNOW, UCS2 and UTF-8, a value below
// 256 is treated as a byte in the source encoding and converted to Unicode.
435 if( RTL_TEXTENCODING_DONTKNOW != eSrcEnc &&
436 RTL_TEXTENCODING_UCS2 != eSrcEnc &&
437 RTL_TEXTENCODING_UTF8 != eSrcEnc &&
438 cChar < 256 )
440 const sal_uInt32 convertFlags =
441 RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_DEFAULT |
442 RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_DEFAULT |
443 RTL_TEXTTOUNICODE_FLAGS_INVALID_DEFAULT;
445 char cEncodedChar = static_cast<char>(cChar);
446 cChar = OUString(&cEncodedChar, 1, eSrcEnc, convertFlags).toChar();
447 if( 0U == cChar )
449 // If the character could not be
450 // converted, because a conversion is not
451 // available, do no conversion at all.
452 cChar = cEncodedChar;
456 else
457 nNextCh = 0U;
// Anything that is not a Unicode code point, and any control character
// other than CR, LF and TAB, is replaced by '?'.
459 if (!rtl::isUnicodeCodePoint(cChar)
460 || (linguistic::IsControlChar(cChar)
461 && cChar != '\r' && cChar != '\n' && cChar != '\t'))
463 cChar = '?';
// Named entity: collect up to MAX_ENTITY_LEN alphanumeric characters.
466 else if( rtl::isAsciiAlpha( nNextCh ) )
468 OUStringBuffer sEntityBuffer( MAX_ENTITY_LEN );
469 sal_Int32 nPos = 0;
472 sEntityBuffer.appendUtf32( nNextCh );
473 nPos++;
474 nNextCh = GetNextChar();
476 while( nPos < MAX_ENTITY_LEN && rtl::isAsciiAlphanumeric( nNextCh ) &&
477 !rInput.eof() );
479 if( IsParserWorking() && !rInput.eof() )
481 std::u16string_view sEntity(sEntityBuffer.subView(0, nPos));
482 cChar = GetHTMLCharName( sEntity );
484 // not found ( == 0 ): plain text
485 // or a character which is inserted as attribute
486 if( 0U == cChar && ';' != nNextCh )
488 DBG_ASSERT( rInput.Tell() - nStreamPos ==
489 static_cast<sal_uInt64>(nPos+1)*GetCharSize(),
490 "UTF-8 is failing here" );
// Try progressively shorter prefixes of the collected name; when one is a
// known entity, seek the stream back behind that prefix.
491 for( sal_Int32 i = nPos-1; i>1; i-- )
493 nNextCh = sEntityBuffer[i];
494 sEntityBuffer.setLength( i );
495 sEntity = sEntityBuffer.subView(0, i);
496 cChar = GetHTMLCharName( sEntity );
497 if( cChar )
499 rInput.SeekRel( -static_cast<sal_Int64>
500 (nPos-i)*GetCharSize() );
501 nlLinePos -= sal_uInt32(nPos-i);
502 nPos = i;
503 ClearTxtConvContext();
504 break;
509 if( !cChar ) // unknown character?
511 // back in stream, insert '&'
512 // and restart with next character
513 sTmpBuffer.append( '&' );
515 DBG_ASSERT( rInput.Tell()-nStreamPos ==
516 static_cast<sal_uInt64>(nPos+1)*GetCharSize(),
517 "Wrong stream position" );
518 DBG_ASSERT( nlLinePos-nLinePos ==
519 static_cast<sal_uInt32>(nPos+1),
520 "Wrong line position" );
521 rInput.Seek( nStreamPos );
522 nlLinePos = nLinePos;
523 ClearTxtConvContext();
524 break;
527 assert(cChar != 0);
529 // 1 == Non Breaking Space
530 // 2 == SoftHyphen
532 if (cChar == 1 || cChar == 2)
534 if( '>' == cBreak )
536 // When reading the content of a tag we have
537 // to change it to ' ' or '-'
538 if( 1U == cChar )
539 cChar = ' ';
540 else //2U
541 cChar = '-';
543 else
545 // If not scanning a tag return token
546 aToken.append( sTmpBuffer );
547 sTmpBuffer.setLength(0);
549 if( !aToken.isEmpty() )
551 // restart with character
552 nNextCh = '&';
553 DBG_ASSERT( rInput.Tell()-nStreamPos ==
554 static_cast<sal_uInt64>(nPos+1)*GetCharSize(),
555 "Wrong stream position" );
556 DBG_ASSERT( nlLinePos-nLinePos ==
557 static_cast<sal_uInt32>(nPos+1),
558 "Wrong line position" );
559 rInput.Seek( nStreamPos );
560 nlLinePos = nLinePos;
561 ClearTxtConvContext();
562 return HtmlTokenId::TEXTTOKEN;
565 // Hack: _GetNextChar shall not read the
566 // next character
567 if( ';' != nNextCh )
568 aToken.append( " " );
569 if( 1U == cChar )
570 return HtmlTokenId::NONBREAKSPACE;
571 else //2U
572 return HtmlTokenId::SOFTHYPH;
576 else
577 nNextCh = 0U;
579 // &{...};-JavaScript-Macros are not supported any longer.
580 else if( IsParserWorking() )
582 sTmpBuffer.append( '&' );
583 bNextCh = false;
584 break;
// A terminating ';' is consumed; any other character is processed in the
// next iteration.
587 bNextCh = (';' == nNextCh);
588 if( cBreak=='>' && (cChar=='\\' || cChar=='\'' ||
589 cChar=='\"' || cChar==' ') )
591 // ' and " have to be escaped within tags to separate
592 // them from ' and " enclosing options.
593 // \ has to be escaped as well.
594 // Space is protected because it's not a delimiter between
595 // options.
596 sTmpBuffer.append( '\\' );
598 if( IsParserWorking() )
600 if( cChar )
601 sTmpBuffer.appendUtf32( cChar );
603 else if( SvParserState::Pending==eState && '>'!=cBreak )
605 // Restart with '&', the remainder is returned as
606 // text token.
607 if( !aToken.isEmpty() || !sTmpBuffer.isEmpty() )
609 // _GetNextChar() returns the previous text and
610 // during the next execution a new character is read.
611 // Thus we have to position in front of the '&'.
612 nNextCh = 0U;
613 rInput.Seek( nStreamPos - GetCharSize() );
614 nlLinePos = nLinePos-1;
615 ClearTxtConvContext();
616 bReadNextChar = true;
618 bNextCh = false;
621 break;
// Inside a tag and outside quotes, '=' arms the quote detection below.
622 case '=':
623 if( '>'==cBreak && !cQuote )
624 bEqSignFound = true;
625 sTmpBuffer.appendUtf32( nNextCh );
626 break;
628 case '\\':
629 if( '>'==cBreak )
631 // mark within tags
632 sTmpBuffer.append( '\\' );
634 sTmpBuffer.append( '\\' );
635 break;
// Inside a tag, a quote directly after '=' opens a quoted section and the
// matching quote character closes it again.
637 case '\"':
638 case '\'':
639 if( '>'==cBreak )
641 if( bEqSignFound )
642 cQuote = nNextCh;
643 else if( cQuote && (cQuote==nNextCh ) )
644 cQuote = 0U;
646 sTmpBuffer.appendUtf32( nNextCh );
647 bEqSignFound = false;
648 break;
650 case sal_Unicode(EOF):
651 if( rInput.eof() )
653 bContinue = false;
655 // else: ignore, not a valid code point
656 break;
// '<' ends plain text but is kept as an ordinary character inside a tag.
658 case '<':
659 bEqSignFound = false;
660 if( '>'==cBreak )
661 sTmpBuffer.appendUtf32( nNextCh );
662 else
663 bContinue = false; // break, string is together
664 break;
666 case '\f':
667 if( '>' == cBreak )
669 // If scanning options treat it like a space, ...
670 sTmpBuffer.append( ' ' );
672 else
674 // otherwise it's a separate token.
675 bContinue = false;
677 break;
679 case '\r':
680 case '\n':
681 if( '>'==cBreak )
683 // cr/lf in tag is handled in GetNextToken_()
684 sTmpBuffer.appendUtf32( nNextCh );
685 break;
// In the line-preserving modes a line end terminates the text token.
687 else if( bReadListing || bReadXMP || bReadPRE || bReadTextArea )
689 bContinue = false;
690 break;
692 // Reduce sequence of CR/LF/BLANK/TAB to a single blank
693 [[fallthrough]];
694 case '\t':
695 if( '\t'==nNextCh && bReadPRE && '>'!=cBreak )
697 // Pass Tabs up in <PRE>
698 bContinue = false;
699 break;
701 [[fallthrough]];
// A vertical tab is dropped in <PRE>/<XMP>/<LISTING> text; otherwise it is
// treated like a blank.
702 case '\x0b':
703 if( '\x0b'==nNextCh && (bReadPRE || bReadXMP ||bReadListing) &&
704 '>'!=cBreak )
706 break;
708 nNextCh = ' ';
709 [[fallthrough]];
710 case ' ':
711 sTmpBuffer.appendUtf32( nNextCh );
712 if( '>'!=cBreak && (!bReadListing && !bReadXMP &&
713 !bReadPRE && !bReadTextArea) )
715 // Reduce sequences of Blanks/Tabs/CR/LF to a single blank
716 do {
717 nNextCh = GetNextChar();
718 if( sal_Unicode(EOF) == nNextCh && rInput.eof() )
720 if( !aToken.isEmpty() || sTmpBuffer.getLength() > 1 )
722 // Have seen s.th. aside from blanks?
723 aToken.append( sTmpBuffer );
724 sTmpBuffer.setLength(0);
725 return HtmlTokenId::TEXTTOKEN;
727 else
728 // Only read blanks: no text must be returned
729 // and GetNextToken_ has to read until EOF
730 return HtmlTokenId::NONE;
732 } while ( ' ' == nNextCh || '\t' == nNextCh ||
733 '\r' == nNextCh || '\n' == nNextCh ||
734 '\x0b' == nNextCh );
// nNextCh already holds the first non-blank character.
735 bNextCh = false;
737 break;
739 default:
740 bEqSignFound = false;
// The break character ends the scan unless it sits inside quotes.
741 if (nNextCh == cBreak && !cQuote)
742 bContinue = false;
743 else
// Fast path: consume a run of ASCII letters and digits in one go;
// control characters are skipped.
745 do {
746 if (!linguistic::IsControlChar(nNextCh))
748 // All remaining characters make their way into the text.
749 sTmpBuffer.appendUtf32( nNextCh );
752 nNextCh = GetNextChar();
753 if( ( sal_Unicode(EOF) == nNextCh && rInput.eof() ) ||
754 !IsParserWorking() )
756 if( !sTmpBuffer.isEmpty() )
757 aToken.append( sTmpBuffer );
758 return HtmlTokenId::TEXTTOKEN;
760 } while( rtl::isAsciiAlpha( nNextCh ) || rtl::isAsciiDigit( nNextCh ) );
761 bNextCh = false;
765 if( bContinue && bNextCh )
766 nNextCh = GetNextChar();
// Hand over whatever is left in the temporary buffer.
769 if( !sTmpBuffer.isEmpty() )
770 aToken.append( sTmpBuffer );
772 return HtmlTokenId::TEXTTOKEN;
// Reads the next piece of raw content (script, style, or content ended by
// aEndToken) into aToken. Reading stops at a line end, at EOF, or when the
// terminating tag is found.
//
// Returns HtmlTokenId::RAWDATA by default. HtmlTokenId::NONE is returned
// when the end token was already found in the previous call, when the end
// token or EOF is reached without any collected text, or when the parser
// is no longer working.
775 HtmlTokenId HTMLParser::GetNextRawToken()
777 OUStringBuffer sTmpBuffer( MAX_LEN );
779 if( bEndTokenFound )
781 // During the last execution we already found the end token,
782 // thus we don't have to search it again.
783 bReadScript = false;
784 bReadStyle = false;
785 aEndToken.clear();
786 bEndTokenFound = false;
788 return HtmlTokenId::NONE;
791 // Default return value: HtmlTokenId::RAWDATA
792 bool bContinue = true;
793 HtmlTokenId nToken = HtmlTokenId::RAWDATA;
794 SaveState( HtmlTokenId::NONE );
795 while( bContinue && IsParserWorking() )
// bNextCh: read a new character at the end of this iteration.
797 bool bNextCh = true;
798 switch( nNextCh )
800 case '<':
802 // Maybe we've reached the end.
804 // Save what we have read previously...
805 aToken.append( sTmpBuffer );
806 sTmpBuffer.setLength(0);
808 // and remember position in stream.
809 sal_uInt64 nStreamPos = rInput.Tell();
810 sal_uInt32 nLineNr = GetLineNr();
811 sal_uInt32 nLinePos = GetLinePos();
813 // Start of an end token?
814 bool bOffState = false;
815 if( '/' == (nNextCh = GetNextChar()) )
817 bOffState = true;
818 nNextCh = GetNextChar();
// A leading '!' is kept so that a comment start can be recognized below.
820 else if( '!' == nNextCh )
822 sTmpBuffer.appendUtf32( nNextCh );
823 nNextCh = GetNextChar();
826 // Read following letters
827 while( (rtl::isAsciiAlpha(nNextCh) || '-'==nNextCh) &&
828 IsParserWorking() && sTmpBuffer.getLength() < MAX_LEN )
830 sTmpBuffer.appendUtf32( nNextCh );
831 nNextCh = GetNextChar();
// The tag name is compared in lower case.
834 OUString aTok( sTmpBuffer.toString() );
835 aTok = aTok.toAsciiLowerCase();
836 bool bDone = false;
837 if( bReadScript || !aEndToken.isEmpty() )
// End tags are only looked for outside of comments.
839 if( !bReadComment )
841 if( aTok.startsWith( OOO_STRING_SVTOOLS_HTML_comment ) )
843 bReadComment = true;
845 else
847 // A script has to end with "</SCRIPT>". But
848 // ">" is optional for security reasons
849 bDone = bOffState &&
850 ( bReadScript
851 ? aTok == OOO_STRING_SVTOOLS_HTML_script
852 : aTok == aEndToken );
855 if( bReadComment && '>'==nNextCh && aTok.endsWith( "--" ) )
857 // End of comment of style <!----->
858 bReadComment = false;
861 else
863 // Style sheets can be closed by </STYLE>, </HEAD> or <BODY>
864 if( bOffState )
865 bDone = aTok == OOO_STRING_SVTOOLS_HTML_style ||
866 aTok == OOO_STRING_SVTOOLS_HTML_head;
867 else
868 bDone = aTok == OOO_STRING_SVTOOLS_HTML_body;
871 if( bDone )
873 // Done! Return the previously read string (if requested)
874 // and continue.
876 bContinue = false;
878 // nToken==0 means, GetNextToken_ continues to read
879 if( aToken.isEmpty() && (bReadStyle || bReadScript) )
881 // Immediately close environment (or context?)
882 // and parse the end token
883 bReadScript = false;
884 bReadStyle = false;
885 aEndToken.clear();
886 nToken = HtmlTokenId::NONE;
888 else
890 // Keep bReadScript/bReadStyle alive
891 // and parse end token during next execution
892 bEndTokenFound = true;
895 // Move backwards in stream to '<'
896 rInput.Seek( nStreamPos );
897 SetLineNr( nLineNr );
898 SetLinePos( nLinePos );
899 ClearTxtConvContext();
900 nNextCh = '<';
902 // Don't append string to token.
903 sTmpBuffer.setLength( 0 );
905 else
907 // remember "</" , everything else we find in the buffer
908 aToken.append( "<" );
909 if( bOffState )
910 aToken.append( "/" );
// nNextCh already holds the character following the tag name.
912 bNextCh = false;
915 break;
// Inside a comment, a run of at least two '-' directly followed by '>'
// ends the comment.
916 case '-':
917 sTmpBuffer.appendUtf32( nNextCh );
918 if( bReadComment )
920 bool bTwoMinus = false;
921 nNextCh = GetNextChar();
922 while( '-' == nNextCh && IsParserWorking() )
924 bTwoMinus = true;
925 sTmpBuffer.appendUtf32( nNextCh );
926 nNextCh = GetNextChar();
929 if( '>' == nNextCh && IsParserWorking() && bTwoMinus )
930 bReadComment = false;
932 bNextCh = false;
934 break;
936 case '\r':
937 // \r\n? closes the current text token (even if it's empty)
938 nNextCh = GetNextChar();
939 if( nNextCh=='\n' )
940 nNextCh = GetNextChar();
941 bContinue = false;
942 break;
943 case '\n':
944 // \n closes the current text token (even if it's empty)
945 nNextCh = GetNextChar();
946 bContinue = false;
947 break;
948 case sal_Unicode(EOF):
949 // eof closes the current text token and behaves like having read
950 // an end token
951 if( rInput.eof() )
953 bContinue = false;
954 if( !aToken.isEmpty() || !sTmpBuffer.isEmpty() )
956 bEndTokenFound = true;
958 else
960 bReadScript = false;
961 bReadStyle = false;
962 aEndToken.clear();
963 nToken = HtmlTokenId::NONE;
966 break;
// Control characters other than TAB are dropped from the raw data.
967 default:
968 if (!linguistic::IsControlChar(nNextCh) || nNextCh == '\t')
970 // all remaining characters are appended to the buffer
971 sTmpBuffer.appendUtf32( nNextCh );
973 break;
// When the loop is about to end, flush the temporary buffer into aToken.
976 if( !bContinue && !sTmpBuffer.isEmpty() )
978 aToken.append( sTmpBuffer );
979 sTmpBuffer.setLength(0);
982 if( bContinue && bNextCh )
983 nNextCh = GetNextChar();
986 if( IsParserWorking() )
987 SaveState( HtmlTokenId::NONE );
988 else
989 nToken = HtmlTokenId::NONE;
991 return nToken;
994 // Scan next token
995 HtmlTokenId HTMLParser::GetNextToken_()
997 HtmlTokenId nRet = HtmlTokenId::NONE;
998 sSaveToken.clear();
1000 if (mnPendingOffToken != HtmlTokenId::NONE)
1002 // HtmlTokenId::<TOKEN>_OFF generated for HtmlTokenId::<TOKEN>_ON
1003 nRet = mnPendingOffToken;
1004 mnPendingOffToken = HtmlTokenId::NONE;
1005 aToken.setLength( 0 );
1006 return nRet;
1009 // Delete options
1010 maOptions.clear();
1012 if( !IsParserWorking() ) // Don't continue if already an error occurred
1013 return HtmlTokenId::NONE;
1015 bool bReadNextCharSave = bReadNextChar;
1016 if( bReadNextChar )
1018 DBG_ASSERT( !bEndTokenFound,
1019 "Read a character despite </SCRIPT> was read?" );
1020 nNextCh = GetNextChar();
1021 if( !IsParserWorking() ) // Don't continue if already an error occurred
1022 return HtmlTokenId::NONE;
1023 bReadNextChar = false;
1026 if( bReadScript || bReadStyle || !aEndToken.isEmpty() )
1028 nRet = GetNextRawToken();
1029 if( nRet != HtmlTokenId::NONE || !IsParserWorking() )
1030 return nRet;
1033 do {
1034 bool bNextCh = true;
1035 switch( nNextCh )
1037 case '<':
1039 sal_uInt64 nStreamPos = rInput.Tell();
1040 sal_uInt32 nLineNr = GetLineNr();
1041 sal_uInt32 nLinePos = GetLinePos();
1043 bool bOffState = false;
1044 if( '/' == (nNextCh = GetNextChar()) )
1046 bOffState = true;
1047 nNextCh = GetNextChar();
1049 // Assume '<?' is a start of an XML declaration, ignore it.
1050 if (rtl::isAsciiAlpha(nNextCh) || nNextCh == '!' || nNextCh == '?')
1052 OUStringBuffer sTmpBuffer;
1053 do {
1054 sTmpBuffer.appendUtf32( nNextCh );
1055 nNextCh = GetNextChar();
1056 if (std::u16string_view(sTmpBuffer) == u"![CDATA[")
1057 break;
1058 if (bFuzzing && sTmpBuffer.getLength() > 1024)
1060 SAL_WARN("svtools", "abandoning import for performance reasons with long tokens");
1061 eState = SvParserState::Error;
1062 break;
1064 } while( '>' != nNextCh && '/' != nNextCh && !rtl::isAsciiWhiteSpace( nNextCh ) &&
1065 !linguistic::IsControlChar(nNextCh) &&
1066 IsParserWorking() && !rInput.eof() );
1068 if( !sTmpBuffer.isEmpty() )
1070 aToken.append( sTmpBuffer );
1071 sTmpBuffer.setLength(0);
1074 // Skip blanks
1075 while( rtl::isAsciiWhiteSpace( nNextCh ) && IsParserWorking() )
1076 nNextCh = GetNextChar();
1078 if( !IsParserWorking() )
1080 if( SvParserState::Pending == eState )
1081 bReadNextChar = bReadNextCharSave;
1082 break;
1085 // Search token in table:
1086 sSaveToken = aToken;
1087 aToken = aToken.toString().toAsciiLowerCase();
1089 if (!maNamespace.isEmpty() && o3tl::starts_with(aToken, maNamespace))
1090 aToken.remove( 0, maNamespace.getLength());
1092 if( HtmlTokenId::NONE == (nRet = GetHTMLToken( aToken )) )
1093 // Unknown control
1094 nRet = HtmlTokenId::UNKNOWNCONTROL_ON;
1096 // If it's a token which can be switched off...
1097 if( bOffState )
1099 if( nRet >= HtmlTokenId::ONOFF_START )
1101 // and there is an off token, return off token instead
1102 nRet = static_cast<HtmlTokenId>(static_cast<int>(nRet) + 1);
1104 else if( HtmlTokenId::LINEBREAK!=nRet || !maNamespace.isEmpty())
1106 // and there is no off token, return unknown token.
1107 // (except for </BR>, that is treated like <BR>)
1108 // No exception for XHTML, though.
1109 nRet = HtmlTokenId::UNKNOWNCONTROL_OFF;
1113 if( nRet == HtmlTokenId::COMMENT )
1115 // fix: due to being case sensitive use sSaveToken as start of comment
1116 // and append a blank.
1117 aToken = sSaveToken;
1118 if( '>'!=nNextCh )
1119 aToken.append( " " );
1120 sal_uInt64 nCStreamPos = 0;
1121 sal_uInt32 nCLineNr = 0;
1122 sal_uInt32 nCLinePos = 0;
1123 sal_Int32 nCStrLen = 0;
1125 bool bDone = false;
1126 // Read until closing -->. If not found restart at first >
1127 sTmpBuffer = aToken;
1128 while( !bDone && !rInput.eof() && IsParserWorking() )
1130 if( '>'==nNextCh )
1132 if( !nCStreamPos )
1134 nCStreamPos = rInput.Tell();
1135 nCStrLen = sTmpBuffer.getLength();
1136 nCLineNr = GetLineNr();
1137 nCLinePos = GetLinePos();
1139 bDone = sTmpBuffer.getLength() >= 2 && sTmpBuffer[sTmpBuffer.getLength() - 2] == '-' && sTmpBuffer[sTmpBuffer.getLength() - 1] == '-';
1140 if( !bDone )
1141 sTmpBuffer.appendUtf32(nNextCh);
1143 else if (!linguistic::IsControlChar(nNextCh)
1144 || nNextCh == '\r' || nNextCh == '\n' || nNextCh == '\t')
1146 sTmpBuffer.appendUtf32(nNextCh);
1148 if( !bDone )
1149 nNextCh = GetNextChar();
1151 aToken = sTmpBuffer;
1152 sTmpBuffer.setLength(0);
1153 if( !bDone && IsParserWorking() && nCStreamPos )
1155 rInput.Seek( nCStreamPos );
1156 SetLineNr( nCLineNr );
1157 SetLinePos( nCLinePos );
1158 ClearTxtConvContext();
1159 aToken.truncate(nCStrLen);
1160 nNextCh = '>';
1163 else if (nRet == HtmlTokenId::CDATA)
1165 // Read until the closing ]]>.
1166 bool bDone = false;
1167 while (!bDone && !rInput.eof() && IsParserWorking())
1169 if (nNextCh == '>')
1171 if (sTmpBuffer.getLength() >= 2)
1173 bDone = sTmpBuffer[sTmpBuffer.getLength() - 2] == ']'
1174 && sTmpBuffer[sTmpBuffer.getLength() - 1] == ']';
1175 if (bDone)
1177 // Ignore ]] at the end.
1178 sTmpBuffer.setLength(sTmpBuffer.getLength() - 2);
1181 if (!bDone)
1183 sTmpBuffer.appendUtf32(nNextCh);
1186 else if (!linguistic::IsControlChar(nNextCh))
1188 sTmpBuffer.appendUtf32(nNextCh);
1190 if (!bDone)
1192 nNextCh = GetNextChar();
1195 aToken = sTmpBuffer;
1196 sTmpBuffer.setLength(0);
1198 else
1200 // TokenString not needed anymore
1201 aToken.setLength( 0 );
1204 // Read until closing '>'
1205 if( '>' != nNextCh && IsParserWorking() )
1207 ScanText( '>' );
1209 // fdo#34666 fdo#36080 fdo#36390: closing "/>"?:
1210 // generate pending HtmlTokenId::<TOKEN>_OFF for HtmlTokenId::<TOKEN>_ON
1211 // Do not convert this to a single HtmlTokenId::<TOKEN>_OFF
1212 // which lead to fdo#56772.
1213 if ((nRet >= HtmlTokenId::ONOFF_START) && o3tl::ends_with(aToken, u"/"))
1215 mnPendingOffToken = static_cast<HtmlTokenId>(static_cast<int>(nRet) + 1); // HtmlTokenId::<TOKEN>_ON -> HtmlTokenId::<TOKEN>_OFF
1216 aToken.setLength( aToken.getLength()-1 ); // remove trailing '/'
1218 if( sal_Unicode(EOF) == nNextCh && rInput.eof() )
1220 // Move back in front of < and restart there.
1221 // Return < as text.
1222 rInput.Seek( nStreamPos );
1223 SetLineNr( nLineNr );
1224 SetLinePos( nLinePos );
1225 ClearTxtConvContext();
1227 aToken = "<";
1228 nRet = HtmlTokenId::TEXTTOKEN;
1229 nNextCh = GetNextChar();
1230 bNextCh = false;
1231 break;
1234 if( SvParserState::Pending == eState )
1235 bReadNextChar = bReadNextCharSave;
1237 else
1239 if( bOffState )
1241 // simply throw away everything
1242 ScanText( '>' );
1243 if( sal_Unicode(EOF) == nNextCh && rInput.eof() )
1245 // Move back in front of < and restart there.
1246 // Return < as text.
1247 rInput.Seek( nStreamPos );
1248 SetLineNr( nLineNr );
1249 SetLinePos( nLinePos );
1250 ClearTxtConvContext();
1252 aToken = "<";
1253 nRet = HtmlTokenId::TEXTTOKEN;
1254 nNextCh = GetNextChar();
1255 bNextCh = false;
1256 break;
1258 if( SvParserState::Pending == eState )
1259 bReadNextChar = bReadNextCharSave;
1260 aToken.setLength( 0 );
1262 else if( '%' == nNextCh )
1264 nRet = HtmlTokenId::UNKNOWNCONTROL_ON;
1266 sal_uInt64 nCStreamPos = rInput.Tell();
1267 sal_uInt32 nCLineNr = GetLineNr(), nCLinePos = GetLinePos();
1269 bool bDone = false;
1270 // Read until closing %>. If not found restart at first >.
1271 sal_Unicode nLastTokenChar = !aToken.isEmpty() ? aToken[aToken.getLength() - 1] : 0;
1272 OUStringBuffer aTmpBuffer(aToken);
1273 while( !bDone && !rInput.eof() && IsParserWorking() )
1275 bDone = '>'==nNextCh && nLastTokenChar == '%';
1276 if( !bDone )
1278 aTmpBuffer.appendUtf32(nNextCh);
1279 nLastTokenChar = aTmpBuffer[aTmpBuffer.getLength() - 1];
1280 nNextCh = GetNextChar();
1283 if( !bDone && IsParserWorking() )
1285 rInput.Seek( nCStreamPos );
1286 SetLineNr( nCLineNr );
1287 SetLinePos( nCLinePos );
1288 ClearTxtConvContext();
1289 aToken = "<%";
1290 nRet = HtmlTokenId::TEXTTOKEN;
1291 break;
1293 aToken = aTmpBuffer;
1294 aTmpBuffer.setLength(0);
1295 if( IsParserWorking() )
1297 sSaveToken = aToken;
1298 aToken.setLength( 0 );
1301 else
1303 aToken = "<";
1304 nRet = HtmlTokenId::TEXTTOKEN;
1305 bNextCh = false;
1306 break;
1310 if( IsParserWorking() )
1312 bNextCh = '>' == nNextCh;
1313 switch( nRet )
1315 case HtmlTokenId::TEXTAREA_ON:
1316 bReadTextArea = true;
1317 break;
1318 case HtmlTokenId::TEXTAREA_OFF:
1319 bReadTextArea = false;
1320 break;
1321 case HtmlTokenId::SCRIPT_ON:
1322 if( !bReadTextArea )
1323 bReadScript = true;
1324 break;
1325 case HtmlTokenId::SCRIPT_OFF:
1326 if( !bReadTextArea )
1328 bReadScript = false;
1329 // JavaScript might modify the stream,
1330 // thus the last character has to be read again.
1331 bReadNextChar = true;
1332 bNextCh = false;
1334 break;
1336 case HtmlTokenId::STYLE_ON:
1337 bReadStyle = true;
1338 break;
1339 case HtmlTokenId::STYLE_OFF:
1340 bReadStyle = false;
1341 break;
1342 default: break;
1346 break;
1348 case sal_Unicode(EOF):
1349 if( rInput.eof() )
1351 eState = SvParserState::Accepted;
1352 nRet = HtmlTokenId(nNextCh);
1354 else
1356 // Read normal text.
1357 goto scan_text;
1359 break;
1361 case '\f':
1362 // form feeds are passed upwards separately
1363 nRet = HtmlTokenId::LINEFEEDCHAR; // !!! should be FORMFEEDCHAR
1364 break;
1366 case '\n':
1367 case '\r':
1368 if( bReadListing || bReadXMP || bReadPRE || bReadTextArea )
1370 sal_Unicode c = GetNextChar();
1371 if( ( '\n' != nNextCh || '\r' != c ) &&
1372 ( '\r' != nNextCh || '\n' != c ) )
1374 bNextCh = false;
1375 nNextCh = c;
1377 nRet = HtmlTokenId::NEWPARA;
1378 break;
1380 [[fallthrough]];
1381 case '\t':
1382 if( bReadPRE )
1384 nRet = HtmlTokenId::TABCHAR;
1385 break;
1387 [[fallthrough]];
1388 case ' ':
1389 [[fallthrough]];
1390 default:
1392 scan_text:
1393 // "normal" text to come
1394 nRet = ScanText();
1395 bNextCh = 0 == aToken.getLength();
1397 // the text should be processed
1398 if( !bNextCh && eState == SvParserState::Pending )
1400 eState = SvParserState::Working;
1401 bReadNextChar = true;
1404 break;
1407 if( bNextCh && SvParserState::Working == eState )
1409 nNextCh = GetNextChar();
1410 if( SvParserState::Pending == eState && nRet != HtmlTokenId::NONE && HtmlTokenId::TEXTTOKEN != nRet )
1412 bReadNextChar = true;
1413 eState = SvParserState::Working;
1417 } while( nRet == HtmlTokenId::NONE && SvParserState::Working == eState );
1419 if( SvParserState::Pending == eState )
1420 nRet = HtmlTokenId::INVALID; // s.th. invalid
1422 return nRet;
1425 void HTMLParser::UnescapeToken()
1427 sal_Int32 nPos=0;
1429 bool bEscape = false;
1430 while( nPos < aToken.getLength() )
1432 bool bOldEscape = bEscape;
1433 bEscape = false;
1434 if( '\\'==aToken[nPos] && !bOldEscape )
1436 aToken.remove( nPos, 1 );
1437 bEscape = true;
1439 else
1441 nPos++;
1446 const HTMLOptions& HTMLParser::GetOptions( HtmlOptionId const *pNoConvertToken )
1448 // If the options for the current token have already been returned,
1449 // return them once again.
1450 if (!maOptions.empty())
1451 return maOptions;
1453 sal_Int32 nPos = 0;
1454 while( nPos < aToken.getLength() )
1456 // A letter? Option beginning here.
1457 if( rtl::isAsciiAlpha( aToken[nPos] ) )
1459 HtmlOptionId nToken;
1460 OUString aValue;
1461 sal_Int32 nStt = nPos;
1462 sal_Unicode cChar = 0;
1464 // Actually only certain characters allowed.
1465 // Netscape only looks for "=" and white space (c.f.
1466 // Mozilla: PA_FetchRequestedNameValues in libparse/pa_mdl.c)
1467 while( nPos < aToken.getLength() )
1469 cChar = aToken[nPos];
1470 if ( '=' == cChar ||!HTML_ISPRINTABLE(cChar) || rtl::isAsciiWhiteSpace(cChar) )
1471 break;
1472 nPos++;
1475 OUString sName( aToken.subView( nStt, nPos-nStt ) );
1477 // PlugIns require original token name. Convert to lower case only for searching.
1478 nToken = GetHTMLOption( sName.toAsciiLowerCase() ); // Name is ready
1479 SAL_WARN_IF( nToken==HtmlOptionId::UNKNOWN, "svtools",
1480 "GetOption: unknown HTML option '" << sName << "'" );
1481 bool bStripCRLF = (nToken < HtmlOptionId::SCRIPT_START ||
1482 nToken >= HtmlOptionId::SCRIPT_END) &&
1483 (!pNoConvertToken || nToken != *pNoConvertToken);
1485 while( nPos < aToken.getLength() )
1487 cChar = aToken[nPos];
1488 if ( HTML_ISPRINTABLE(cChar) && !rtl::isAsciiWhiteSpace(cChar) )
1489 break;
1490 nPos++;
1493 // Option with value?
1494 if( nPos!=aToken.getLength() && '='==cChar )
1496 nPos++;
1498 while( nPos < aToken.getLength() )
1500 cChar = aToken[nPos];
1501 if ( HTML_ISPRINTABLE(cChar) && ' ' != cChar && '\t' != cChar && '\r' != cChar && '\n' != cChar )
1502 break;
1503 nPos++;
1506 if( nPos != aToken.getLength() )
1508 sal_Int32 nLen = 0;
1509 nStt = nPos;
1510 if( ('"'==cChar) || '\''==cChar )
1512 sal_Unicode cEnd = cChar;
1513 nPos++; nStt++;
1514 bool bDone = false;
1515 bool bEscape = false;
1516 while( nPos < aToken.getLength() && !bDone )
1518 bool bOldEscape = bEscape;
1519 bEscape = false;
1520 cChar = aToken[nPos];
1521 switch( cChar )
1523 case '\r':
1524 case '\n':
1525 if( bStripCRLF )
1526 aToken.remove( nPos, 1 );
1527 else
1529 nPos++;
1530 nLen++;
1532 break;
1533 case '\\':
1534 if( bOldEscape )
1536 nPos++;
1537 nLen++;
1539 else
1541 aToken.remove( nPos, 1 );
1542 bEscape = true;
1544 break;
1545 case '"':
1546 case '\'':
1547 bDone = !bOldEscape && cChar==cEnd;
1548 if( !bDone )
1550 nPos++;
1551 nLen++;
1553 break;
1554 default:
1555 nPos++;
1556 nLen++;
1557 break;
1560 if( nPos!=aToken.getLength() )
1561 nPos++;
1563 else
1565 // More liberal than the standard: allow all printable characters
1566 bool bEscape = false;
1567 bool bDone = false;
1568 while( nPos < aToken.getLength() && !bDone )
1570 bool bOldEscape = bEscape;
1571 bEscape = false;
1572 sal_Unicode c = aToken[nPos];
1573 switch( c )
1575 case ' ':
1576 bDone = !bOldEscape;
1577 if( !bDone )
1579 nPos++;
1580 nLen++;
1582 break;
1584 case '\t':
1585 case '\r':
1586 case '\n':
1587 bDone = true;
1588 break;
1590 case '\\':
1591 if( bOldEscape )
1593 nPos++;
1594 nLen++;
1596 else
1598 aToken.remove( nPos, 1 );
1599 bEscape = true;
1601 break;
1603 default:
1604 if( HTML_ISPRINTABLE( c ) )
1606 nPos++;
1607 nLen++;
1609 else
1610 bDone = true;
1611 break;
1616 if( nLen )
1617 aValue = aToken.subView( nStt, nLen );
1621 // Token is known and can be saved
1622 maOptions.emplace_back(nToken, sName, aValue);
1625 else
1626 // Ignore white space and unexpected characters
1627 nPos++;
1630 return maOptions;
1633 HtmlTokenId HTMLParser::FilterPRE( HtmlTokenId nToken )
1635 switch( nToken )
1637 // in Netscape they only have impact in not empty paragraphs
1638 case HtmlTokenId::PARABREAK_ON:
1639 nToken = HtmlTokenId::LINEBREAK;
1640 [[fallthrough]];
1641 case HtmlTokenId::LINEBREAK:
1642 case HtmlTokenId::NEWPARA:
1643 nPre_LinePos = 0;
1644 if( bPre_IgnoreNewPara )
1645 nToken = HtmlTokenId::NONE;
1646 break;
1648 case HtmlTokenId::TABCHAR:
1650 sal_Int32 nSpaces = 8 - (nPre_LinePos % 8);
1651 DBG_ASSERT( aToken.isEmpty(), "Why is the token not empty?" );
1652 if (aToken.getLength() < nSpaces)
1654 using comphelper::string::padToLength;
1655 OUStringBuffer aBuf(aToken);
1656 aToken = padToLength(aBuf, nSpaces, ' ');
1658 nPre_LinePos += nSpaces;
1659 nToken = HtmlTokenId::TEXTTOKEN;
1661 break;
1662 // Keep those
1663 case HtmlTokenId::TEXTTOKEN:
1664 nPre_LinePos += aToken.getLength();
1665 break;
1667 case HtmlTokenId::SELECT_ON:
1668 case HtmlTokenId::SELECT_OFF:
1669 case HtmlTokenId::BODY_ON:
1670 case HtmlTokenId::FORM_ON:
1671 case HtmlTokenId::FORM_OFF:
1672 case HtmlTokenId::INPUT:
1673 case HtmlTokenId::OPTION:
1674 case HtmlTokenId::TEXTAREA_ON:
1675 case HtmlTokenId::TEXTAREA_OFF:
1677 case HtmlTokenId::IMAGE:
1678 case HtmlTokenId::APPLET_ON:
1679 case HtmlTokenId::APPLET_OFF:
1680 case HtmlTokenId::PARAM:
1681 case HtmlTokenId::EMBED:
1683 case HtmlTokenId::HEAD1_ON:
1684 case HtmlTokenId::HEAD1_OFF:
1685 case HtmlTokenId::HEAD2_ON:
1686 case HtmlTokenId::HEAD2_OFF:
1687 case HtmlTokenId::HEAD3_ON:
1688 case HtmlTokenId::HEAD3_OFF:
1689 case HtmlTokenId::HEAD4_ON:
1690 case HtmlTokenId::HEAD4_OFF:
1691 case HtmlTokenId::HEAD5_ON:
1692 case HtmlTokenId::HEAD5_OFF:
1693 case HtmlTokenId::HEAD6_ON:
1694 case HtmlTokenId::HEAD6_OFF:
1695 case HtmlTokenId::BLOCKQUOTE_ON:
1696 case HtmlTokenId::BLOCKQUOTE_OFF:
1697 case HtmlTokenId::ADDRESS_ON:
1698 case HtmlTokenId::ADDRESS_OFF:
1699 case HtmlTokenId::HORZRULE:
1701 case HtmlTokenId::CENTER_ON:
1702 case HtmlTokenId::CENTER_OFF:
1703 case HtmlTokenId::DIVISION_ON:
1704 case HtmlTokenId::DIVISION_OFF:
1706 case HtmlTokenId::SCRIPT_ON:
1707 case HtmlTokenId::SCRIPT_OFF:
1708 case HtmlTokenId::RAWDATA:
1710 case HtmlTokenId::TABLE_ON:
1711 case HtmlTokenId::TABLE_OFF:
1712 case HtmlTokenId::CAPTION_ON:
1713 case HtmlTokenId::CAPTION_OFF:
1714 case HtmlTokenId::COLGROUP_ON:
1715 case HtmlTokenId::COLGROUP_OFF:
1716 case HtmlTokenId::COL_ON:
1717 case HtmlTokenId::COL_OFF:
1718 case HtmlTokenId::THEAD_ON:
1719 case HtmlTokenId::THEAD_OFF:
1720 case HtmlTokenId::TFOOT_ON:
1721 case HtmlTokenId::TFOOT_OFF:
1722 case HtmlTokenId::TBODY_ON:
1723 case HtmlTokenId::TBODY_OFF:
1724 case HtmlTokenId::TABLEROW_ON:
1725 case HtmlTokenId::TABLEROW_OFF:
1726 case HtmlTokenId::TABLEDATA_ON:
1727 case HtmlTokenId::TABLEDATA_OFF:
1728 case HtmlTokenId::TABLEHEADER_ON:
1729 case HtmlTokenId::TABLEHEADER_OFF:
1731 case HtmlTokenId::ANCHOR_ON:
1732 case HtmlTokenId::ANCHOR_OFF:
1733 case HtmlTokenId::BOLD_ON:
1734 case HtmlTokenId::BOLD_OFF:
1735 case HtmlTokenId::ITALIC_ON:
1736 case HtmlTokenId::ITALIC_OFF:
1737 case HtmlTokenId::STRIKE_ON:
1738 case HtmlTokenId::STRIKE_OFF:
1739 case HtmlTokenId::STRIKETHROUGH_ON:
1740 case HtmlTokenId::STRIKETHROUGH_OFF:
1741 case HtmlTokenId::UNDERLINE_ON:
1742 case HtmlTokenId::UNDERLINE_OFF:
1743 case HtmlTokenId::BASEFONT_ON:
1744 case HtmlTokenId::BASEFONT_OFF:
1745 case HtmlTokenId::FONT_ON:
1746 case HtmlTokenId::FONT_OFF:
1747 case HtmlTokenId::BLINK_ON:
1748 case HtmlTokenId::BLINK_OFF:
1749 case HtmlTokenId::SPAN_ON:
1750 case HtmlTokenId::SPAN_OFF:
1751 case HtmlTokenId::SUBSCRIPT_ON:
1752 case HtmlTokenId::SUBSCRIPT_OFF:
1753 case HtmlTokenId::SUPERSCRIPT_ON:
1754 case HtmlTokenId::SUPERSCRIPT_OFF:
1755 case HtmlTokenId::BIGPRINT_ON:
1756 case HtmlTokenId::BIGPRINT_OFF:
1757 case HtmlTokenId::SMALLPRINT_OFF:
1758 case HtmlTokenId::SMALLPRINT_ON:
1760 case HtmlTokenId::EMPHASIS_ON:
1761 case HtmlTokenId::EMPHASIS_OFF:
1762 case HtmlTokenId::CITATION_ON:
1763 case HtmlTokenId::CITATION_OFF:
1764 case HtmlTokenId::STRONG_ON:
1765 case HtmlTokenId::STRONG_OFF:
1766 case HtmlTokenId::CODE_ON:
1767 case HtmlTokenId::CODE_OFF:
1768 case HtmlTokenId::SAMPLE_ON:
1769 case HtmlTokenId::SAMPLE_OFF:
1770 case HtmlTokenId::KEYBOARD_ON:
1771 case HtmlTokenId::KEYBOARD_OFF:
1772 case HtmlTokenId::VARIABLE_ON:
1773 case HtmlTokenId::VARIABLE_OFF:
1774 case HtmlTokenId::DEFINSTANCE_ON:
1775 case HtmlTokenId::DEFINSTANCE_OFF:
1776 case HtmlTokenId::SHORTQUOTE_ON:
1777 case HtmlTokenId::SHORTQUOTE_OFF:
1778 case HtmlTokenId::LANGUAGE_ON:
1779 case HtmlTokenId::LANGUAGE_OFF:
1780 case HtmlTokenId::AUTHOR_ON:
1781 case HtmlTokenId::AUTHOR_OFF:
1782 case HtmlTokenId::PERSON_ON:
1783 case HtmlTokenId::PERSON_OFF:
1784 case HtmlTokenId::ACRONYM_ON:
1785 case HtmlTokenId::ACRONYM_OFF:
1786 case HtmlTokenId::ABBREVIATION_ON:
1787 case HtmlTokenId::ABBREVIATION_OFF:
1788 case HtmlTokenId::INSERTEDTEXT_ON:
1789 case HtmlTokenId::INSERTEDTEXT_OFF:
1790 case HtmlTokenId::DELETEDTEXT_ON:
1791 case HtmlTokenId::DELETEDTEXT_OFF:
1792 case HtmlTokenId::TELETYPE_ON:
1793 case HtmlTokenId::TELETYPE_OFF:
1795 break;
1797 // The remainder is treated as an unknown token.
1798 default:
1799 if( nToken != HtmlTokenId::NONE )
1801 nToken =
1802 ( ((nToken >= HtmlTokenId::ONOFF_START) && isOffToken(nToken))
1803 ? HtmlTokenId::UNKNOWNCONTROL_OFF
1804 : HtmlTokenId::UNKNOWNCONTROL_ON );
1806 break;
1809 bPre_IgnoreNewPara = false;
1811 return nToken;
1814 HtmlTokenId HTMLParser::FilterXMP( HtmlTokenId nToken )
1816 switch( nToken )
1818 case HtmlTokenId::NEWPARA:
1819 if( bPre_IgnoreNewPara )
1820 nToken = HtmlTokenId::NONE;
1821 [[fallthrough]];
1822 case HtmlTokenId::TEXTTOKEN:
1823 case HtmlTokenId::NONBREAKSPACE:
1824 case HtmlTokenId::SOFTHYPH:
1825 break; // kept
1827 default:
1828 if( nToken != HtmlTokenId::NONE )
1830 if( (nToken >= HtmlTokenId::ONOFF_START) && isOffToken(nToken) )
1832 sSaveToken = "</" + sSaveToken;
1834 else
1835 sSaveToken = "<" + sSaveToken;
1836 if( !aToken.isEmpty() )
1838 UnescapeToken();
1839 sSaveToken += " ";
1840 aToken.insert(0, sSaveToken);
1842 else
1843 aToken = sSaveToken;
1844 aToken.append( ">" );
1845 nToken = HtmlTokenId::TEXTTOKEN;
1847 break;
1850 bPre_IgnoreNewPara = false;
1852 return nToken;
1855 HtmlTokenId HTMLParser::FilterListing( HtmlTokenId nToken )
1857 switch( nToken )
1859 case HtmlTokenId::NEWPARA:
1860 if( bPre_IgnoreNewPara )
1861 nToken = HtmlTokenId::NONE;
1862 [[fallthrough]];
1863 case HtmlTokenId::TEXTTOKEN:
1864 case HtmlTokenId::NONBREAKSPACE:
1865 case HtmlTokenId::SOFTHYPH:
1866 break; // kept
1868 default:
1869 if( nToken != HtmlTokenId::NONE )
1871 nToken =
1872 ( ((nToken >= HtmlTokenId::ONOFF_START) && isOffToken(nToken))
1873 ? HtmlTokenId::UNKNOWNCONTROL_OFF
1874 : HtmlTokenId::UNKNOWNCONTROL_ON );
1876 break;
1879 bPre_IgnoreNewPara = false;
1881 return nToken;
1884 bool HTMLParser::InternalImgToPrivateURL( OUString& rURL )
1886 bool bFound = false;
1888 if( rURL.startsWith( OOO_STRING_SVTOOLS_HTML_internal_icon ) )
1890 OUString aName( rURL.copy(14) );
1891 switch( aName[0] )
1893 case 'b':
1894 bFound = aName == OOO_STRING_SVTOOLS_HTML_INT_ICON_baddata;
1895 break;
1896 case 'd':
1897 bFound = aName == OOO_STRING_SVTOOLS_HTML_INT_ICON_delayed;
1898 break;
1899 case 'e':
1900 bFound = aName == OOO_STRING_SVTOOLS_HTML_INT_ICON_embed;
1901 break;
1902 case 'i':
1903 bFound = aName == OOO_STRING_SVTOOLS_HTML_INT_ICON_insecure;
1904 break;
1905 case 'n':
1906 bFound = aName == OOO_STRING_SVTOOLS_HTML_INT_ICON_notfound;
1907 break;
1910 if( bFound )
1912 OUString sTmp ( rURL );
1913 rURL = OOO_STRING_SVTOOLS_HTML_private_image;
1914 rURL += sTmp;
1917 return bFound;
namespace {

/// Kind of a <META> entry, as looked up from its NAME / HTTP-EQUIV value
/// via aHTMLMetaNameTable. NONE means that no known name matched.
enum class HtmlMeta
{
    NONE = 0,       ///< not one of the known names
    Author,
    Description,
    Keywords,
    Refresh,
    Classification,
    Created,
    ChangedBy,
    Changed,
    Generator,
    SDFootnote,
    SDEndnote,
    ContentType
};

}
1940 // <META NAME=xxx>
1941 HTMLOptionEnum<HtmlMeta> const aHTMLMetaNameTable[] =
1943 { OOO_STRING_SVTOOLS_HTML_META_author, HtmlMeta::Author },
1944 { OOO_STRING_SVTOOLS_HTML_META_changed, HtmlMeta::Changed },
1945 { OOO_STRING_SVTOOLS_HTML_META_changedby, HtmlMeta::ChangedBy },
1946 { OOO_STRING_SVTOOLS_HTML_META_classification,HtmlMeta::Classification},
1947 { OOO_STRING_SVTOOLS_HTML_META_content_type, HtmlMeta::ContentType },
1948 { OOO_STRING_SVTOOLS_HTML_META_created, HtmlMeta::Created },
1949 { OOO_STRING_SVTOOLS_HTML_META_description, HtmlMeta::Description },
1950 { OOO_STRING_SVTOOLS_HTML_META_keywords, HtmlMeta::Keywords },
1951 { OOO_STRING_SVTOOLS_HTML_META_generator, HtmlMeta::Generator },
1952 { OOO_STRING_SVTOOLS_HTML_META_refresh, HtmlMeta::Refresh },
1953 { OOO_STRING_SVTOOLS_HTML_META_sdendnote, HtmlMeta::SDEndnote },
1954 { OOO_STRING_SVTOOLS_HTML_META_sdfootnote, HtmlMeta::SDFootnote },
1955 { nullptr, HtmlMeta(0) }
1959 void HTMLParser::AddMetaUserDefined( OUString const & )
1963 bool HTMLParser::ParseMetaOptionsImpl(
1964 const uno::Reference<document::XDocumentProperties> & i_xDocProps,
1965 SvKeyValueIterator *i_pHTTPHeader,
1966 const HTMLOptions& aOptions,
1967 rtl_TextEncoding& o_rEnc )
1969 OUString aName, aContent;
1970 HtmlMeta nAction = HtmlMeta::NONE;
1971 bool bHTTPEquiv = false, bChanged = false;
1973 for ( size_t i = aOptions.size(); i; )
1975 const HTMLOption& aOption = aOptions[--i];
1976 switch ( aOption.GetToken() )
1978 case HtmlOptionId::NAME:
1979 aName = aOption.GetString();
1980 if ( HtmlMeta::NONE==nAction )
1982 aOption.GetEnum( nAction, aHTMLMetaNameTable );
1984 break;
1985 case HtmlOptionId::HTTPEQUIV:
1986 aName = aOption.GetString();
1987 aOption.GetEnum( nAction, aHTMLMetaNameTable );
1988 bHTTPEquiv = true;
1989 break;
1990 case HtmlOptionId::CONTENT:
1991 aContent = aOption.GetString();
1992 break;
1993 case HtmlOptionId::CHARSET:
1995 OString sValue(OUStringToOString(aOption.GetString(), RTL_TEXTENCODING_ASCII_US));
1996 o_rEnc = GetExtendedCompatibilityTextEncoding(rtl_getTextEncodingFromMimeCharset(sValue.getStr()));
1997 break;
1999 default: break;
2003 if ( bHTTPEquiv || HtmlMeta::Description != nAction )
2005 // if it is not a Description, remove CRs and LFs from CONTENT
2006 aContent = aContent.replaceAll("\r", "").replaceAll("\n", "");
2008 else
2010 // convert line endings for Description
2011 aContent = convertLineEnd(aContent, GetSystemLineEnd());
2014 if ( bHTTPEquiv && i_pHTTPHeader )
2016 // Netscape seems to just ignore a closing ", so we do too
2017 if ( aContent.endsWith("\"") )
2019 aContent = aContent.copy( 0, aContent.getLength() - 1 );
2021 SvKeyValue aKeyValue( aName, aContent );
2022 i_pHTTPHeader->Append( aKeyValue );
2025 switch ( nAction )
2027 case HtmlMeta::Author:
2028 if (i_xDocProps.is()) {
2029 i_xDocProps->setAuthor( aContent );
2030 bChanged = true;
2032 break;
2033 case HtmlMeta::Description:
2034 if (i_xDocProps.is()) {
2035 i_xDocProps->setDescription( aContent );
2036 bChanged = true;
2038 break;
2039 case HtmlMeta::Keywords:
2040 if (i_xDocProps.is()) {
2041 i_xDocProps->setKeywords(
2042 ::comphelper::string::convertCommaSeparated(aContent));
2043 bChanged = true;
2045 break;
2046 case HtmlMeta::Classification:
2047 if (i_xDocProps.is()) {
2048 i_xDocProps->setSubject( aContent );
2049 bChanged = true;
2051 break;
2053 case HtmlMeta::ChangedBy:
2054 if (i_xDocProps.is()) {
2055 i_xDocProps->setModifiedBy( aContent );
2056 bChanged = true;
2058 break;
2060 case HtmlMeta::Created:
2061 case HtmlMeta::Changed:
2062 if (i_xDocProps.is() && !aContent.isEmpty())
2064 ::util::DateTime uDT;
2065 bool valid = false;
2066 if (comphelper::string::getTokenCount(aContent, ';') == 2)
2068 sal_Int32 nIdx{ 0 };
2069 Date aDate(o3tl::toInt32(o3tl::getToken(aContent, 0, ';', nIdx)));
2070 auto nTime = o3tl::toInt64(o3tl::getToken(aContent, 0, ';', nIdx));
2071 if (nTime < 0)
2072 nTime = o3tl::saturating_toggle_sign(nTime);
2073 tools::Time aTime(nTime);
2074 DateTime aDateTime(aDate, aTime);
2075 uDT = aDateTime.GetUNODateTime();
2076 valid = true;
2078 else if (utl::ISO8601parseDateTime(aContent, uDT))
2079 valid = true;
2081 if (valid)
2083 bChanged = true;
2084 if (HtmlMeta::Created == nAction)
2085 i_xDocProps->setCreationDate(uDT);
2086 else
2087 i_xDocProps->setModificationDate(uDT);
2090 break;
2092 case HtmlMeta::Refresh:
2093 DBG_ASSERT( !bHTTPEquiv || i_pHTTPHeader, "Lost Reload-URL because of omitted MUST change." );
2094 break;
2096 case HtmlMeta::ContentType:
2097 if ( !aContent.isEmpty() )
2099 o_rEnc = GetEncodingByMIME( aContent );
2101 break;
2103 case HtmlMeta::NONE:
2104 if ( !bHTTPEquiv )
2106 if (i_xDocProps.is())
2108 uno::Reference<beans::XPropertyContainer> xUDProps
2109 = i_xDocProps->getUserDefinedProperties();
2110 try {
2111 xUDProps->addProperty(aName,
2112 beans::PropertyAttribute::REMOVABLE,
2113 uno::Any(aContent));
2114 AddMetaUserDefined(aName);
2115 bChanged = true;
2116 } catch (uno::Exception &) {
2117 // ignore
2121 break;
2122 default:
2123 break;
2126 return bChanged;
2129 bool HTMLParser::ParseMetaOptions(
2130 const uno::Reference<document::XDocumentProperties> & i_xDocProps,
2131 SvKeyValueIterator *i_pHeader )
2133 HtmlOptionId nContentOption = HtmlOptionId::CONTENT;
2134 rtl_TextEncoding eEnc = RTL_TEXTENCODING_DONTKNOW;
2136 bool bRet = ParseMetaOptionsImpl( i_xDocProps, i_pHeader,
2137 GetOptions(&nContentOption),
2138 eEnc );
2140 // If the encoding is set by a META tag, it may only overwrite the
2141 // current encoding if both, the current and the new encoding, are 1-sal_uInt8
2142 // encodings. Everything else cannot lead to reasonable results.
2143 if (RTL_TEXTENCODING_DONTKNOW != eEnc &&
2144 rtl_isOctetTextEncoding( eEnc ) &&
2145 rtl_isOctetTextEncoding( GetSrcEncoding() ) )
2147 eEnc = GetExtendedCompatibilityTextEncoding( eEnc );
2148 SetSrcEncoding( eEnc );
2151 return bRet;
2154 rtl_TextEncoding HTMLParser::GetEncodingByMIME( const OUString& rMime )
2156 OUString sType;
2157 OUString sSubType;
2158 INetContentTypeParameterList aParameters;
2159 if (INetContentTypes::parse(rMime, sType, sSubType, &aParameters))
2161 auto const iter = aParameters.find("charset");
2162 if (iter != aParameters.end())
2164 const INetContentTypeParameter * pCharset = &iter->second;
2165 OString sValue(OUStringToOString(pCharset->m_sValue, RTL_TEXTENCODING_ASCII_US));
2166 return GetExtendedCompatibilityTextEncoding( rtl_getTextEncodingFromMimeCharset( sValue.getStr() ) );
2169 return RTL_TEXTENCODING_DONTKNOW;
2172 rtl_TextEncoding HTMLParser::GetEncodingByHttpHeader( SvKeyValueIterator *pHTTPHeader )
2174 rtl_TextEncoding eRet = RTL_TEXTENCODING_DONTKNOW;
2175 if( pHTTPHeader )
2177 SvKeyValue aKV;
2178 for( bool bCont = pHTTPHeader->GetFirst( aKV ); bCont;
2179 bCont = pHTTPHeader->GetNext( aKV ) )
2181 if( aKV.GetKey().equalsIgnoreAsciiCase( OOO_STRING_SVTOOLS_HTML_META_content_type ) )
2183 if( !aKV.GetValue().isEmpty() )
2185 eRet = HTMLParser::GetEncodingByMIME( aKV.GetValue() );
2190 return eRet;
2193 bool HTMLParser::SetEncodingByHTTPHeader( SvKeyValueIterator *pHTTPHeader )
2195 bool bRet = false;
2196 rtl_TextEncoding eEnc = HTMLParser::GetEncodingByHttpHeader( pHTTPHeader );
2197 if(RTL_TEXTENCODING_DONTKNOW != eEnc)
2199 SetSrcEncoding( eEnc );
2200 bRet = true;
2202 return bRet;
2206 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */