svtools/source/svhtml/parhtml.cxx

   1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
   2 /*
   3  * This file is part of the LibreOffice project.
   4  *
   5  * This Source Code Form is subject to the terms of the Mozilla Public
   6  * License, v. 2.0. If a copy of the MPL was not distributed with this
   7  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
   8  *
   9  * This file incorporates work covered by the following license notice:
  10  *
  11  *   Licensed to the Apache Software Foundation (ASF) under one or more
  12  *   contributor license agreements. See the NOTICE file distributed
  13  *   with this work for additional information regarding copyright
  14  *   ownership. The ASF licenses this file to you under the Apache
  15  *   License, Version 2.0 (the "License"); you may not use this file
  16  *   except in compliance with the License. You may obtain a copy of
  17  *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
  18  */
  19
  20 #include <comphelper/string.hxx>
  21 #include <o3tl/safeint.hxx>
  22 #include <o3tl/string_view.hxx>
  23 #include <tools/stream.hxx>
  24 #include <tools/debug.hxx>
  25 #include <tools/color.hxx>
  26 #include <rtl/ustrbuf.hxx>
  27 #include <rtl/character.hxx>
  28 #include <rtl/tencinfo.h>
  29 #include <sal/log.hxx>
  30 #include <tools/tenccvt.hxx>
  31 #include <tools/datetime.hxx>
  32 #include <unotools/datetime.hxx>
  33 #include <svl/inettype.hxx>
  34 #include <svl/lngmisc.hxx>
  35 #include <com/sun/star/beans/PropertyAttribute.hpp>
  36 #include <com/sun/star/document/XDocumentProperties.hpp>
  37
  38 #include <svtools/parhtml.hxx>
  39 #include <svtools/htmltokn.h>
  40 #include <svtools/htmlkywd.hxx>
  41
  42 #include <utility>
  43
  44 using namespace ::com::sun::star;
  45
  46
  47 const sal_Int32 MAX_LEN( 1024 );
  48
  49 const sal_Int32 MAX_ENTITY_LEN( 8 );
  50
  51
  52 // Tables to convert option values into strings
  53
  54 // <INPUT TYPE=xxx>
  55 HTMLOptionEnum<HTMLInputType> const aInputTypeOptEnums[] =
  56 {
  57     { OOO_STRING_SVTOOLS_HTML_IT_text,      HTMLInputType::Text        },
  58     { OOO_STRING_SVTOOLS_HTML_IT_password,  HTMLInputType::Password    },
  59     { OOO_STRING_SVTOOLS_HTML_IT_checkbox,  HTMLInputType::Checkbox    },
  60     { OOO_STRING_SVTOOLS_HTML_IT_radio,     HTMLInputType::Radio       },
  61     { OOO_STRING_SVTOOLS_HTML_IT_range,     HTMLInputType::Range       },
  62     { OOO_STRING_SVTOOLS_HTML_IT_scribble,  HTMLInputType::Scribble    },
  63     { OOO_STRING_SVTOOLS_HTML_IT_file,      HTMLInputType::File        },
  64     { OOO_STRING_SVTOOLS_HTML_IT_hidden,    HTMLInputType::Hidden      },
  65     { OOO_STRING_SVTOOLS_HTML_IT_submit,    HTMLInputType::Submit      },
  66     { OOO_STRING_SVTOOLS_HTML_IT_image,     HTMLInputType::Image       },
  67     { OOO_STRING_SVTOOLS_HTML_IT_reset,     HTMLInputType::Reset       },
  68     { OOO_STRING_SVTOOLS_HTML_IT_button,    HTMLInputType::Button      },
  69     { nullptr,                              HTMLInputType(0)    }
  70 };
  71
  72 // <TABLE FRAME=xxx>
  73 HTMLOptionEnum<HTMLTableFrame> const aTableFrameOptEnums[] =
  74 {
  75     { OOO_STRING_SVTOOLS_HTML_TF_void,    HTMLTableFrame::Void    },
  76     { OOO_STRING_SVTOOLS_HTML_TF_above,   HTMLTableFrame::Above   },
  77     { OOO_STRING_SVTOOLS_HTML_TF_below,   HTMLTableFrame::Below   },
  78     { OOO_STRING_SVTOOLS_HTML_TF_hsides,  HTMLTableFrame::HSides  },
  79     { OOO_STRING_SVTOOLS_HTML_TF_lhs,     HTMLTableFrame::LHS     },
  80     { OOO_STRING_SVTOOLS_HTML_TF_rhs,     HTMLTableFrame::RHS     },
  81     { OOO_STRING_SVTOOLS_HTML_TF_vsides,  HTMLTableFrame::VSides  },
  82     { OOO_STRING_SVTOOLS_HTML_TF_box,     HTMLTableFrame::Box     },
  83     { OOO_STRING_SVTOOLS_HTML_TF_border,  HTMLTableFrame::Box     },
  84     { nullptr,                            HTMLTableFrame(0) }
  85 };
  86
  87 // <TABLE RULES=xxx>
  88 HTMLOptionEnum<HTMLTableRules> const aTableRulesOptEnums[] =
  89 {
  90     { OOO_STRING_SVTOOLS_HTML_TR_none,   HTMLTableRules::NONE      },
  91     { OOO_STRING_SVTOOLS_HTML_TR_groups, HTMLTableRules::Groups    },
  92     { OOO_STRING_SVTOOLS_HTML_TR_rows,   HTMLTableRules::Rows      },
  93     { OOO_STRING_SVTOOLS_HTML_TR_cols,   HTMLTableRules::Cols      },
  94     { OOO_STRING_SVTOOLS_HTML_TR_all,    HTMLTableRules::All       },
  95     { nullptr,                           HTMLTableRules(0) }
  96 };
  97
  98
  99 HTMLOption::HTMLOption( HtmlOptionId nTok, OUString _aToken,
 100                         OUString _aValue )
 101     : aValue(std::move(_aValue))
 102     , aToken(std::move(_aToken))
 103     , nToken( nTok )
 104 {
 105     DBG_ASSERT( nToken>=HtmlOptionId::BOOL_START && nToken<HtmlOptionId::END,
 106         "HTMLOption: unknown token" );
 107 }
 108
 109 sal_uInt32 HTMLOption::GetNumber() const
 110 {
 111     DBG_ASSERT( (nToken>=HtmlOptionId::NUMBER_START &&
 112                  nToken<HtmlOptionId::NUMBER_END) ||
 113                 (nToken>=HtmlOptionId::CONTEXT_START &&
 114                  nToken<HtmlOptionId::CONTEXT_END) ||
 115                 nToken==HtmlOptionId::VALUE,
 116         "GetNumber: Option not numerical" );
 117     OUString aTmp(comphelper::string::stripStart(aValue, ' '));
 118     sal_Int32 nTmp = aTmp.toInt32();
 119     return nTmp >= 0 ? static_cast<sal_uInt32>(nTmp) : 0;
 120 }
 121
 122 sal_Int32 HTMLOption::GetSNumber() const
 123 {
 124     DBG_ASSERT( (nToken>=HtmlOptionId::NUMBER_START && nToken<HtmlOptionId::NUMBER_END) ||
 125                 (nToken>=HtmlOptionId::CONTEXT_START && nToken<HtmlOptionId::CONTEXT_END),
 126         "GetSNumber: Option not numerical" );
 127     OUString aTmp(comphelper::string::stripStart(aValue, ' '));
 128     return aTmp.toInt32();
 129 }
 130
 131 void HTMLOption::GetNumbers( std::vector<sal_uInt32> &rNumbers ) const
 132 {
 133     rNumbers.clear();
 134
 135     // This is a very simplified scanner: it only searches all
 136     // numerals in the string.
 137     bool bInNum = false;
 138     sal_uInt32 nNum = 0;
 139     for( sal_Int32 i=0; i<aValue.getLength(); i++ )
 140     {
 141         sal_Unicode c = aValue[ i ];
 142         if( c>='0' && c<='9' )
 143         {
 144             nNum *= 10;
 145             nNum += (c - '0');
 146             bInNum = true;
 147         }
 148         else if( bInNum )
 149         {
 150             rNumbers.push_back( nNum );
 151             bInNum = false;
 152             nNum = 0;
 153         }
 154     }
 155     if( bInNum )
 156     {
 157         rNumbers.push_back( nNum );
 158     }
 159 }
 160
 161 void HTMLOption::GetColor( Color& rColor ) const
 162 {
 163     DBG_ASSERT( (nToken>=HtmlOptionId::COLOR_START && nToken<HtmlOptionId::COLOR_END) || nToken==HtmlOptionId::SIZE,
 164         "GetColor: Option is not a color." );
 165
 166     OUString aTmp(aValue.toAsciiLowerCase());
 167     sal_uInt32 nColor = SAL_MAX_UINT32;
 168     if (!aTmp.isEmpty() && aTmp[0] != '#')
 169         nColor = GetHTMLColor(aTmp);
 170
 171     if( SAL_MAX_UINT32 == nColor )
 172     {
 173         nColor = 0;
 174         sal_Int32 nPos = 0;
 175         for (sal_uInt32 i=0; i<6; ++i)
 176         {
 177             // Whatever Netscape does to get color values,
 178             // at maximum three characters < '0' are ignored.
 179             sal_Unicode c = nPos<aTmp.getLength() ? aTmp[ nPos++ ] : '0';
 180             if( c < '0' )
 181             {
 182                 c = nPos<aTmp.getLength() ? aTmp[nPos++] : '0';
 183                 if( c < '0' )
 184                     c = nPos<aTmp.getLength() ? aTmp[nPos++] : '0';
 185             }
 186             nColor *= 16;
 187             if( c >= '0' && c <= '9' )
 188                 nColor += (c - '0');
 189             else if( c >= 'a' && c <= 'f' )
 190                 nColor += (c + 0xa - 'a');
 191         }
 192     }
 193
 194     rColor.SetRed(   static_cast<sal_uInt8>((nColor & 0x00ff0000) >> 16) );
 195     rColor.SetGreen( static_cast<sal_uInt8>((nColor & 0x0000ff00) >> 8));
 196     rColor.SetBlue(  static_cast<sal_uInt8>(nColor & 0x000000ff) );
 197 }
 198
 199 HTMLInputType HTMLOption::GetInputType() const
 200 {
 201     DBG_ASSERT( nToken==HtmlOptionId::TYPE, "GetInputType: Option not TYPE" );
 202     return GetEnum( aInputTypeOptEnums, HTMLInputType::Text );
 203 }
 204
 205 HTMLTableFrame HTMLOption::GetTableFrame() const
 206 {
 207     DBG_ASSERT( nToken==HtmlOptionId::FRAME, "GetTableFrame: Option not FRAME" );
 208     return GetEnum( aTableFrameOptEnums );
 209 }
 210
 211 HTMLTableRules HTMLOption::GetTableRules() const
 212 {
 213     DBG_ASSERT( nToken==HtmlOptionId::RULES, "GetTableRules: Option not RULES" );
 214     return GetEnum( aTableRulesOptEnums );
 215 }
 216
 217 HTMLParser::HTMLParser( SvStream& rIn, bool bReadNewDoc ) :
 218     SvParser<HtmlTokenId>( rIn ),
 219     bNewDoc(bReadNewDoc),
 220     bIsInHeader(true),
 221     bReadListing(false),
 222     bReadXMP(false),
 223     bReadPRE(false),
 224     bReadTextArea(false),
 225     bReadScript(false),
 226     bReadStyle(false),
 227     bEndTokenFound(false),
 228     bPre_IgnoreNewPara(false),
 229     bReadNextChar(false),
 230     bReadComment(false),
 231     nPre_LinePos(0),
 232     mnPendingOffToken(HtmlTokenId::NONE)
 233 {
 234     //#i76649, default to UTF-8 for HTML unless we know differently
 235     SetSrcEncoding(RTL_TEXTENCODING_UTF8);
 236 }
 237
 238 HTMLParser::~HTMLParser()
 239 {
 240 }
 241
 242 void HTMLParser::SetNamespace(std::u16string_view rNamespace)
 243 {
 244     // Convert namespace alias to a prefix.
 245     maNamespace = OUString::Concat(rNamespace) + ":";
 246 }
 247
 248 namespace
 249 {
 250     class RefGuard
 251     {
 252     private:
 253         HTMLParser& m_rParser;
 254     public:
 255         RefGuard(HTMLParser& rParser)
 256             : m_rParser(rParser)
 257         {
 258             m_rParser.AddFirstRef();
 259         }
 260
 261         ~RefGuard()
 262         {
 263             if (m_rParser.GetStatus() != SvParserState::Pending)
 264                 m_rParser.ReleaseRef(); // Parser not needed anymore
 265         }
 266     };
 267 }
 268
 269 SvParserState HTMLParser::CallParser()
 270 {
 271     eState = SvParserState::Working;
 272     nNextCh = GetNextChar();
 273     SaveState( HtmlTokenId::NONE );
 274
 275     nPre_LinePos = 0;
 276     bPre_IgnoreNewPara = false;
 277
 278     RefGuard aRefGuard(*this);
 279
 280     Continue( HtmlTokenId::NONE );
 281
 282     return eState;
 283 }
 284
 285 void HTMLParser::Continue( HtmlTokenId nToken )
 286 {
 287     if( nToken == HtmlTokenId::NONE )
 288         nToken = GetNextToken();
 289
 290     while( IsParserWorking() )
 291     {
 292         SaveState( nToken );
 293         nToken = FilterToken( nToken );
 294
 295         if( nToken != HtmlTokenId::NONE )
 296             NextToken( nToken );
 297
 298         if( IsParserWorking() )
 299             SaveState( HtmlTokenId::NONE );         // continue with new token
 300
 301         nToken = GetNextToken();
 302     }
 303 }
 304
 305 HtmlTokenId HTMLParser::FilterToken( HtmlTokenId nToken )
 306 {
 307     switch( nToken )
 308     {
 309     case HtmlTokenId(EOF):
 310         nToken = HtmlTokenId::NONE;
 311         break;          // don't pass
 312
 313     case HtmlTokenId::HEAD_OFF:
 314         bIsInHeader = false;
 315         break;
 316
 317     case HtmlTokenId::HEAD_ON:
 318         bIsInHeader = true;
 319         break;
 320
 321     case HtmlTokenId::BODY_ON:
 322         bIsInHeader = false;
 323         break;
 324
 325     case HtmlTokenId::FRAMESET_ON:
 326         bIsInHeader = false;
 327         break;
 328
 329     case HtmlTokenId::BODY_OFF:
 330         bReadPRE = bReadListing = bReadXMP = false;
 331         break;
 332
 333     case HtmlTokenId::HTML_OFF:
 334         nToken = HtmlTokenId::NONE;
 335         bReadPRE = bReadListing = bReadXMP = false;
 336         break;      // HtmlTokenId::ON hasn't been passed either !
 337
 338     case HtmlTokenId::PREFORMTXT_ON:
 339         StartPRE();
 340         break;
 341
 342     case HtmlTokenId::PREFORMTXT_OFF:
 343         FinishPRE();
 344         break;
 345
 346     case HtmlTokenId::LISTING_ON:
 347         StartListing();
 348         break;
 349
 350     case HtmlTokenId::LISTING_OFF:
 351         FinishListing();
 352         break;
 353
 354     case HtmlTokenId::XMP_ON:
 355         StartXMP();
 356         break;
 357
 358     case HtmlTokenId::XMP_OFF:
 359         FinishXMP();
 360         break;
 361
 362     default:
 363         if( bReadPRE )
 364             nToken = FilterPRE( nToken );
 365         else if( bReadListing )
 366             nToken = FilterListing( nToken );
 367         else if( bReadXMP )
 368             nToken = FilterXMP( nToken );
 369
 370         break;
 371     }
 372
 373     return nToken;
 374 }
 375
 376 namespace {
 377
 378 constexpr bool HTML_ISPRINTABLE(sal_Unicode c) { return c >= 32 && c != 127; }
 379
 380 constexpr bool HTML_ISSPACE(sal_uInt32 c)
 381 {
 382     return ' ' == c || '\t' == c || '\r' == c || '\n' == c || '\x0b' == c;
 383 }
 384
 385 }
 386
 387 HtmlTokenId HTMLParser::ScanText(const sal_Unicode cBreak)
 388 {
 389     OUStringBuffer sTmpBuffer( MAX_LEN );
 390     bool bContinue = true;
 391     bool bEqSignFound = false;
 392     sal_uInt32  cQuote = 0U;
 393
 394     while( bContinue && IsParserWorking() )
 395     {
 396         bool bNextCh = true;
 397         switch( nNextCh )
 398         {
 399         case '&':
 400             bEqSignFound = false;
 401             if( bReadXMP )
 402                 sTmpBuffer.append( '&' );
 403             else
 404             {
 405                 sal_uInt64 nStreamPos = rInput.Tell();
 406                 sal_uInt32 nLinePos = GetLinePos();
 407
 408                 sal_uInt32 cChar = 0U;
 409                 if( '#' == (nNextCh = GetNextChar()) )
 410                 {
 411                     nNextCh = GetNextChar();
 412                     const bool bIsHex( 'x' == nNextCh );
 413                     const bool bIsDecOrHex( bIsHex || rtl::isAsciiDigit(nNextCh) );
 414                     if ( bIsDecOrHex )
 415                     {
 416                         if ( bIsHex )
 417                         {
 418                             nNextCh = GetNextChar();
 419                             while ( rtl::isAsciiHexDigit(nNextCh) )
 420                             {
 421                                 cChar = cChar * 16U +
 422                                         ( nNextCh <= '9'
 423                                           ? sal_uInt32( nNextCh - '0' )
 424                                           : ( nNextCh <= 'F'
 425                                               ? sal_uInt32( nNextCh - 'A' + 10 )
 426                                               : sal_uInt32( nNextCh - 'a' + 10 ) ) );
 427                                 nNextCh = GetNextChar();
 428                             }
 429                         }
 430                         else
 431                         {
 432                             do
 433                             {
 434                                 cChar = cChar * 10U + sal_uInt32( nNextCh - '0');
 435                                 nNextCh = GetNextChar();
 436                             }
 437                             while( rtl::isAsciiDigit(nNextCh) );
 438                         }
 439
 440                         if( RTL_TEXTENCODING_DONTKNOW != eSrcEnc &&
 441                             RTL_TEXTENCODING_UCS2 != eSrcEnc &&
 442                             RTL_TEXTENCODING_UTF8 != eSrcEnc &&
 443                             cChar < 256 )
 444                         {
 445                             const sal_uInt32 convertFlags =
 446                                 RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_DEFAULT |
 447                                 RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_DEFAULT |
 448                                 RTL_TEXTTOUNICODE_FLAGS_INVALID_DEFAULT;
 449
 450                             char cEncodedChar = static_cast<char>(cChar);
 451                             cChar = OUString(&cEncodedChar, 1, eSrcEnc, convertFlags).toChar();
 452                             if( 0U == cChar )
 453                             {
 454                                 // If the character could not be
 455                                 // converted, because a conversion is not
 456                                 // available, do no conversion at all.
 457                                 cChar = cEncodedChar;
 458                             }
 459                         }
 460                     }
 461                     else
 462                         nNextCh = 0U;
 463
 464                     if (!rtl::isUnicodeCodePoint(cChar)
 465                         || (linguistic::IsControlChar(cChar)
 466                             && cChar != '\r' && cChar != '\n' && cChar != '\t'))
 467                     {
 468                         cChar = '?';
 469                     }
 470                 }
 471                 else if( rtl::isAsciiAlpha( nNextCh ) )
 472                 {
 473                     OUStringBuffer sEntityBuffer( MAX_ENTITY_LEN );
 474                     sal_Int32 nPos = 0;
 475                     do
 476                     {
 477                         sEntityBuffer.appendUtf32( nNextCh );
 478                         nPos++;
 479                         nNextCh = GetNextChar();
 480                     }
 481                     while( nPos < MAX_ENTITY_LEN && rtl::isAsciiAlphanumeric( nNextCh ) &&
 482                            !rInput.eof() );
 483
 484                     if( IsParserWorking() && !rInput.eof() )
 485                     {
 486                         std::u16string_view sEntity(sEntityBuffer.subView(0, nPos));
 487                         cChar = GetHTMLCharName( sEntity );
 488
 489                         // not found ( == 0 ): plain text
 490                         // or a character which is inserted as attribute
 491                         if( 0U == cChar && ';' != nNextCh )
 492                         {
 493                             DBG_ASSERT( rInput.Tell() - nStreamPos ==
 494                                         static_cast<sal_uInt64>(nPos+1)*GetCharSize(),
 495                                         "UTF-8 is failing here" );
 496                             for( sal_Int32 i = nPos-1; i>1; i-- )
 497                             {
 498                                 nNextCh = sEntityBuffer[i];
 499                                 sEntityBuffer.setLength( i );
 500                                 sEntity = sEntityBuffer.subView(0, i);
 501                                 cChar = GetHTMLCharName( sEntity );
 502                                 if( cChar )
 503                                 {
 504                                     rInput.SeekRel( -static_cast<sal_Int64>
 505                                             (nPos-i)*GetCharSize() );
 506                                     nlLinePos -= sal_uInt32(nPos-i);
 507                                     nPos = i;
 508                                     ClearTxtConvContext();
 509                                     break;
 510                                 }
 511                             }
 512                         }
 513
 514                         if( !cChar )        // unknown character?
 515                         {
 516                             // back in stream, insert '&'
 517                             // and restart with next character
 518                             sTmpBuffer.append( '&' );
 519
 520                             DBG_ASSERT( rInput.Tell()-nStreamPos ==
 521                                         static_cast<sal_uInt64>(nPos+1)*GetCharSize(),
 522                                         "Wrong stream position" );
 523                             DBG_ASSERT( nlLinePos-nLinePos ==
 524                                         static_cast<sal_uInt32>(nPos+1),
 525                                         "Wrong line position" );
 526                             rInput.Seek( nStreamPos );
 527                             nlLinePos = nLinePos;
 528                             ClearTxtConvContext();
 529                             break;
 530                         }
 531
 532                         assert(cChar != 0);
 533
 534                         // 1 == Non Breaking Space
 535                         // 2 == SoftHyphen
 536
 537                         if (cChar == 1 || cChar == 2)
 538                         {
 539                             if( '>' == cBreak )
 540                             {
 541                                 // When reading the content of a tag we have
 542                                 // to change it to ' ' or '-'
 543                                 if( 1U == cChar )
 544                                     cChar = ' ';
 545                                 else //2U
 546                                     cChar = '-';
 547                             }
 548                             else
 549                             {
 550                                 // If not scanning a tag return token
 551                                 aToken.append( sTmpBuffer );
 552                                 sTmpBuffer.setLength(0);
 553
 554                                 if( !aToken.isEmpty() )
 555                                 {
 556                                     // restart with character
 557                                     nNextCh = '&';
 558                                     DBG_ASSERT( rInput.Tell()-nStreamPos ==
 559                                                 static_cast<sal_uInt64>(nPos+1)*GetCharSize(),
 560                                                 "Wrong stream position" );
 561                                     DBG_ASSERT( nlLinePos-nLinePos ==
 562                                                 static_cast<sal_uInt32>(nPos+1),
 563                                                 "Wrong line position" );
 564                                     rInput.Seek( nStreamPos );
 565                                     nlLinePos = nLinePos;
 566                                     ClearTxtConvContext();
 567                                     return HtmlTokenId::TEXTTOKEN;
 568                                 }
 569
 570                                 // Hack: _GetNextChar shall not read the
 571                                 // next character
 572                                 if( ';' != nNextCh )
 573                                     aToken.append( " " );
 574                                 if( 1U == cChar )
 575                                     return HtmlTokenId::NONBREAKSPACE;
 576                                 else //2U
 577                                     return HtmlTokenId::SOFTHYPH;
 578                             }
 579                         }
 580                     }
 581                     else
 582                         nNextCh = 0U;
 583                 }
 584                 // &{...};-JavaScript-Macros are not supported any longer.
 585                 else if( IsParserWorking() )
 586                 {
 587                     sTmpBuffer.append( '&' );
 588                     bNextCh = false;
 589                     break;
 590                 }
 591
 592                 bNextCh = (';' == nNextCh);
 593                 if( cBreak=='>' && (cChar=='\\' || cChar=='\'' ||
 594                                     cChar=='\"' || cChar==' ') )
 595                 {
 596                     // ' and " have to be escaped within tags to separate
 597                     // them from ' and " enclosing options.
 598                     // \ has to be escaped as well.
 599                     // Space is protected because it's not a delimiter between
 600                     // options.
 601                     sTmpBuffer.append( '\\' );
 602                 }
 603                 if( IsParserWorking() )
 604                 {
 605                     if( cChar )
 606                         sTmpBuffer.appendUtf32( cChar );
 607                 }
 608                 else if( SvParserState::Pending==eState && '>'!=cBreak )
 609                 {
 610                     // Restart with '&', the remainder is returned as
 611                     // text token.
 612                     if( !aToken.isEmpty() || !sTmpBuffer.isEmpty() )
 613                     {
 614                         // _GetNextChar() returns the previous text and
 615                         // during the next execution a new character is read.
 616                         // Thus we have to position in front of the '&'.
 617                         nNextCh = 0U;
 618                         rInput.Seek( nStreamPos - GetCharSize() );
 619                         nlLinePos = nLinePos-1;
 620                         ClearTxtConvContext();
 621                         bReadNextChar = true;
 622                     }
 623                     bNextCh = false;
 624                 }
 625             }
 626             break;
 627         case '=':
 628             if( '>'==cBreak && !cQuote )
 629                 bEqSignFound = true;
 630             sTmpBuffer.appendUtf32( nNextCh );
 631             break;
 632
 633         case '\\':
 634             if( '>'==cBreak )
 635             {
 636                 // mark within tags
 637                 sTmpBuffer.append( '\\' );
 638             }
 639             sTmpBuffer.append( '\\' );
 640             break;
 641
 642         case '\"':
 643         case '\'':
 644             if( '>'==cBreak )
 645             {
 646                 if( bEqSignFound )
 647                     cQuote = nNextCh;
 648                 else if( cQuote && (cQuote==nNextCh ) )
 649                     cQuote = 0U;
 650             }
 651             sTmpBuffer.appendUtf32( nNextCh );
 652             bEqSignFound = false;
 653             break;
 654
 655         case sal_Unicode(EOF):
 656             if( rInput.eof() )
 657             {
 658                 bContinue = false;
 659             }
 660             // else: ignore, not a valid code point
 661             break;
 662
 663         case '<':
 664             bEqSignFound = false;
 665             if( '>'==cBreak )
 666                 sTmpBuffer.appendUtf32( nNextCh );
 667             else
 668                 bContinue = false;      // break, string is together
 669             break;
 670
 671         case '\f':
 672             if( '>' == cBreak )
 673             {
 674                 // If scanning options treat it like a space, ...
 675                 sTmpBuffer.append( ' ' );
 676             }
 677             else
 678             {
 679                 // otherwise it's a separate token.
 680                 bContinue = false;
 681             }
 682             break;
 683
 684         case '\r':
 685         case '\n':
 686             if( '>'==cBreak )
 687             {
 688                 // cr/lf in tag is handled in GetNextToken_()
 689                 sTmpBuffer.appendUtf32( nNextCh );
 690                 break;
 691             }
 692             else if( bReadListing || bReadXMP || bReadPRE || bReadTextArea )
 693             {
 694                 bContinue = false;
 695                 break;
 696             }
 697             // Reduce sequence of CR/LF/BLANK/TAB to a single blank
 698             [[fallthrough]];
 699         case '\t':
 700             if( '\t'==nNextCh && bReadPRE && '>'!=cBreak )
 701             {
 702                 // Pass Tabs up in <PRE>
 703                 bContinue = false;
 704                 break;
 705             }
 706             [[fallthrough]];
 707         case '\x0b':
 708             if( '\x0b'==nNextCh && (bReadPRE || bReadXMP ||bReadListing) &&
 709                 '>'!=cBreak )
 710             {
 711                 break;
 712             }
 713             if (!m_bPreserveSpaces)
 714                 nNextCh = ' ';
 715             [[fallthrough]];
 716         case ' ':
 717             if (!m_bPreserveSpaces)
 718             {
 719                 sTmpBuffer.appendUtf32(nNextCh);
 720                 if ('>' != cBreak && (!bReadListing && !bReadXMP && !bReadPRE && !bReadTextArea))
 721                 {
 722                     // Reduce sequences of Blanks/Tabs/CR/LF to a single blank
 723                     do
 724                     {
 725                         nNextCh = GetNextChar();
 726                         if (sal_Unicode(EOF) == nNextCh && rInput.eof())
 727                         {
 728                             if (!aToken.isEmpty() || sTmpBuffer.getLength() > 1)
 729                             {
 730                                 // Have seen s.th. aside from blanks?
 731                                 aToken.append(sTmpBuffer);
 732                                 sTmpBuffer.setLength(0);
 733                                 return HtmlTokenId::TEXTTOKEN;
 734                             }
 735                             else
 736                                 // Only read blanks: no text must be returned
 737                                 // and GetNextToken_ has to read until EOF
 738                                 return HtmlTokenId::NONE;
 739                         }
 740                     } while (HTML_ISSPACE(nNextCh));
 741                     bNextCh = false;
 742                 }
 743                 break;
 744             }
 745             [[fallthrough]];
 746         default:
 747             bEqSignFound = false;
 748             if (nNextCh == cBreak && !cQuote)
 749                 bContinue = false;
 750             else
 751             {
 752                 do {
 753                     if (!linguistic::IsControlChar(nNextCh) || HTML_ISSPACE(nNextCh))
 754                     {
 755                     // All remaining characters make their way into the text.
 756                         sTmpBuffer.appendUtf32( nNextCh );
 757                     }
 758
 759                     nNextCh = GetNextChar();
 760                     if( ( sal_Unicode(EOF) == nNextCh && rInput.eof() ) ||
 761                         !IsParserWorking() )
 762                     {
 763                         if( !sTmpBuffer.isEmpty() )
 764                             aToken.append( sTmpBuffer );
 765                         return HtmlTokenId::TEXTTOKEN;
 766                     }
 767                 } while( rtl::isAsciiAlpha( nNextCh ) || rtl::isAsciiDigit( nNextCh ) );
 768                 bNextCh = false;
 769             }
 770         }
 771
 772         if( bContinue && bNextCh )
 773             nNextCh = GetNextChar();
 774     }
 775
 776     if( !sTmpBuffer.isEmpty() )
 777         aToken.append( sTmpBuffer );
 778
 779     return HtmlTokenId::TEXTTOKEN;
 780 }
 781
 782 HtmlTokenId HTMLParser::GetNextRawToken()
 783 {
     // Scans raw (unparsed) content: the caller GetNextToken_ invokes this
     // while bReadScript or bReadStyle is set or aEndToken is non-empty.
     // Text is accumulated in aToken until a line break, the end of the
     // input or the terminating tag is reached.
     // Returns HtmlTokenId::RAWDATA for a chunk of raw text, or
     // HtmlTokenId::NONE when the raw section is finished (or the parser is
     // no longer working), in which case the caller continues with the
     // regular tokenizer.
 784     OUStringBuffer sTmpBuffer( MAX_LEN );
 785
 786     if( bEndTokenFound )
 787     {
 788         // During the last execution we already found the end token,
 789         // thus we don't have to search it again.
 790         bReadScript = false;
 791         bReadStyle = false;
 792         aEndToken.clear();
 793         bEndTokenFound = false;
 794
 795         return HtmlTokenId::NONE;
 796     }
 797
 798     // Default return value: HtmlTokenId::RAWDATA
 799     bool bContinue = true;
 800     HtmlTokenId nToken = HtmlTokenId::RAWDATA;
 801     SaveState( HtmlTokenId::NONE );
 802     while( bContinue && IsParserWorking() )
 803     {
             // bNextCh == false means nNextCh already holds the next unread
             // character, so the read at the bottom of the loop is skipped.
 804         bool bNextCh = true;
 805         switch( nNextCh )
 806         {
 807         case '<':
 808             {
 809                 // Maybe we've reached the end.
 810
 811                 // Save what we have read previously...
 812                 aToken.append( sTmpBuffer );
 813                 sTmpBuffer.setLength(0);
 814
 815                 // and remember position in stream.
 816                 sal_uInt64 nStreamPos = rInput.Tell();
 817                 sal_uInt32 nLineNr = GetLineNr();
 818                 sal_uInt32 nLinePos = GetLinePos();
 819
 820                 // Start of an end token?
 821                 bool bOffState = false;
 822                 if( '/' == (nNextCh = GetNextChar()) )
 823                 {
 824                     bOffState = true;
 825                     nNextCh = GetNextChar();
 826                 }
 827                 else if( '!' == nNextCh )
 828                 {
                         // Keep the '!' so that a comment start can be
                         // recognized in the collected name below.
 829                     sTmpBuffer.appendUtf32( nNextCh );
 830                     nNextCh = GetNextChar();
 831                 }
 832
 833                 // Read following letters
                     // (letters and '-' only, capped at MAX_LEN characters)
 834                 while( (rtl::isAsciiAlpha(nNextCh) || '-'==nNextCh) &&
 835                        IsParserWorking() && sTmpBuffer.getLength() < MAX_LEN )
 836                 {
 837                     sTmpBuffer.appendUtf32( nNextCh );
 838                     nNextCh = GetNextChar();
 839                 }
 840
                     // Compare case-insensitively: work on a lower-case copy.
 841                 OUString aTok( sTmpBuffer.toString() );
 842                 aTok = aTok.toAsciiLowerCase();
 843                 bool bDone = false;
 844                 if( bReadScript || !aEndToken.isEmpty() )
 845                 {
                         // Script / explicit end token mode. While inside a
                         // comment (bReadComment) no end token is accepted.
 846                     if( !bReadComment )
 847                     {
 848                         if( aTok.startsWith( OOO_STRING_SVTOOLS_HTML_comment ) )
 849                         {
 850                             bReadComment = true;
 851                         }
 852                         else
 853                         {
 854                             // A script has to end with "</SCRIPT>". But
 855                             // ">" is optional for security reasons
 856                             bDone = bOffState &&
 857                             ( bReadScript
 858                                 ? aTok == OOO_STRING_SVTOOLS_HTML_script
 859                                 : aTok == aEndToken );
 860                         }
 861                     }
 862                     if( bReadComment && '>'==nNextCh && aTok.endsWith( "--" ) )
 863                     {
 864                         // End of comment of style <!----->
 865                         bReadComment = false;
 866                     }
 867                 }
 868                 else
 869                 {
 870                     // Style sheets can be closed by </STYLE>, </HEAD> or <BODY>
 871                     if( bOffState )
 872                         bDone = aTok == OOO_STRING_SVTOOLS_HTML_style ||
 873                                 aTok == OOO_STRING_SVTOOLS_HTML_head;
 874                     else
 875                         bDone = aTok == OOO_STRING_SVTOOLS_HTML_body;
 876                 }
 877
 878                 if( bDone )
 879                 {
 880                     // Done! Return the previously read string (if requested)
 881                     // and continue.
 882
 883                     bContinue = false;
 884
 885                     // nToken==0 means, GetNextToken_ continues to read
 886                     if( aToken.isEmpty() && (bReadStyle || bReadScript) )
 887                     {
 888                         // Immediately close environment (or context?)
 889                         // and parse the end token
 890                         bReadScript = false;
 891                         bReadStyle = false;
 892                         aEndToken.clear();
 893                         nToken = HtmlTokenId::NONE;
 894                     }
 895                     else
 896                     {
 897                         // Keep bReadScript/bReadStyle alive
 898                         // and parse end token during next execution
 899                         bEndTokenFound = true;
 900                     }
 901
 902                     // Move backwards in stream to '<'
                         // so that the terminating tag itself is tokenized
                         // by the regular scanner afterwards.
 903                     rInput.Seek( nStreamPos );
 904                     SetLineNr( nLineNr );
 905                     SetLinePos( nLinePos );
 906                     ClearTxtConvContext();
 907                     nNextCh = '<';
 908
 909                     // Don't append string to token.
 910                     sTmpBuffer.setLength( 0 );
 911                 }
 912                 else
 913                 {
 914                     // remember "</" , everything else we find in the buffer
 915                     aToken.append( "<" );
 916                     if( bOffState )
 917                         aToken.append( "/" );
 918
                         // nNextCh is the first character after the collected
                         // name and has not been processed yet.
 919                     bNextCh = false;
 920                 }
 921             }
 922             break;
 923         case '-':
 924             sTmpBuffer.appendUtf32( nNextCh );
 925             if( bReadComment )
 926             {
                     // Inside a comment: a run of at least two '-' that is
                     // directly followed by '>' terminates the comment.
 927                 bool bTwoMinus = false;
 928                 nNextCh = GetNextChar();
 929                 while( '-' == nNextCh && IsParserWorking() )
 930                 {
 931                     bTwoMinus = true;
 932                     sTmpBuffer.appendUtf32( nNextCh );
 933                     nNextCh = GetNextChar();
 934                 }
 935
 936                 if( '>' == nNextCh && IsParserWorking() && bTwoMinus )
 937                     bReadComment = false;
 938
                     // The character after the '-' run is still unprocessed.
 939                 bNextCh = false;
 940             }
 941             break;
 942
 943         case '\r':
 944             // \r\n? closes the current text token (even if it's empty)
 945             nNextCh = GetNextChar();
 946             if( nNextCh=='\n' )
 947                 nNextCh = GetNextChar();
 948             bContinue = false;
 949             break;
 950         case '\n':
 951             // \n closes the current text token (even if it's empty)
 952             nNextCh = GetNextChar();
 953             bContinue = false;
 954             break;
 955         case sal_Unicode(EOF):
 956             // eof closes the current text token and behaves like having read
 957             // an end token
                 // (if the stream is not really at its end, the character is
                 // dropped and scanning simply continues)
 958             if( rInput.eof() )
 959             {
 960                 bContinue = false;
 961                 if( !aToken.isEmpty() || !sTmpBuffer.isEmpty() )
 962                 {
                         // Deliver the pending text now; the raw mode is left
                         // on the next call.
 963                     bEndTokenFound = true;
 964                 }
 965                 else
 966                 {
 967                     bReadScript = false;
 968                     bReadStyle = false;
 969                     aEndToken.clear();
 970                     nToken = HtmlTokenId::NONE;
 971                 }
 972             }
 973             break;
 974         default:
                 // Control characters other than tab are silently dropped.
 975             if (!linguistic::IsControlChar(nNextCh) || nNextCh == '\t')
 976             {
 977                 // all remaining characters are appended to the buffer
 978                 sTmpBuffer.appendUtf32( nNextCh );
 979             }
 980             break;
 981         }
 982
             // Flush the temporary buffer into aToken once this chunk is complete.
 983         if( !bContinue && !sTmpBuffer.isEmpty() )
 984         {
 985             aToken.append( sTmpBuffer );
 986             sTmpBuffer.setLength(0);
 987         }
 988
 989         if( bContinue && bNextCh )
 990             nNextCh = GetNextChar();
 991     }
 992
         // If the parser stopped working (error or pending input) nothing is
         // reported to the caller.
 993     if( IsParserWorking() )
 994         SaveState( HtmlTokenId::NONE );
 995     else
 996         nToken = HtmlTokenId::NONE;
 997
 998     return nToken;
 999 }
1000
1001 // Scan next token
     //
     // Returns the id of the next token found in the input:
     //  - a pending <TOKEN>_OFF synthesized for a preceding "<token ... />",
     //  - the result of GetNextRawToken() while in script/style/raw mode,
     //  - a tag token for input starting with '<' (comments, CDATA sections
     //    and "<% ... %>" blocks are handled here as well),
     //  - LINEFEEDCHAR / NEWPARA / TABCHAR for the respective control
     //    characters, or whatever ScanText() returns for ordinary text.
     // HtmlTokenId::INVALID is returned when the parser state is Pending
     // after scanning, HtmlTokenId::NONE when the parser is not working.
     // sSaveToken receives the token name in its original spelling.
1002 HtmlTokenId HTMLParser::GetNextToken_()
1003 {
1004     HtmlTokenId nRet = HtmlTokenId::NONE;
1005     sSaveToken.clear();
1006
1007     if (mnPendingOffToken != HtmlTokenId::NONE)
1008     {
1009         // HtmlTokenId::<TOKEN>_OFF generated for HtmlTokenId::<TOKEN>_ON
1010         nRet = mnPendingOffToken;
1011         mnPendingOffToken = HtmlTokenId::NONE;
1012         aToken.setLength( 0 );
1013         return nRet;
1014     }
1015
1016     // Delete options
1017     maOptions.clear();
1018
1019     if( !IsParserWorking() )        // Don't continue if already an error occurred
1020         return HtmlTokenId::NONE;
1021
         // Remember the flag so that it can be restored should the parser
         // become Pending while a tag is being scanned.
1022     bool bReadNextCharSave = bReadNextChar;
1023     if( bReadNextChar )
1024     {
1025         DBG_ASSERT( !bEndTokenFound,
1026                     "Read a character despite </SCRIPT> was read?" );
1027         nNextCh = GetNextChar();
1028         if( !IsParserWorking() )        // Don't continue if already an error occurred
1029             return HtmlTokenId::NONE;
1030         bReadNextChar = false;
1031     }
1032
         // In raw mode the dedicated scanner gets the first chance; NONE
         // from it means that regular tokenizing continues below.
1033     if( bReadScript || bReadStyle || !aEndToken.isEmpty() )
1034     {
1035         nRet = GetNextRawToken();
1036         if( nRet != HtmlTokenId::NONE || !IsParserWorking() )
1037             return nRet;
1038     }
1039
1040     do {
             // bNextCh: read a fresh character at the end of this iteration.
1041         bool bNextCh = true;
1042         switch( nNextCh )
1043         {
1044         case '<':
1045             {
                     // Remember the position right behind '<' to be able to
                     // rewind if this turns out not to be a complete tag.
1046                 sal_uInt64 nStreamPos = rInput.Tell();
1047                 sal_uInt32 nLineNr = GetLineNr();
1048                 sal_uInt32 nLinePos = GetLinePos();
1049
1050                 bool bOffState = false;
1051                 if( '/' == (nNextCh = GetNextChar()) )
1052                 {
1053                     bOffState = true;
1054                     nNextCh = GetNextChar();
1055                 }
1056                 // Assume '<?' is a start of an XML declaration, ignore it.
1057                 if (rtl::isAsciiAlpha(nNextCh) || nNextCh == '!' || nNextCh == '?')
1058                 {
                         // Collect the token name: it ends at '>', '/',
                         // white space, a control character or the end of
                         // the input, or right after "![CDATA[".
1059                     OUStringBuffer sTmpBuffer;
1060                     do {
1061                         sTmpBuffer.appendUtf32( nNextCh );
1062                         nNextCh = GetNextChar();
1063                         if (std::u16string_view(sTmpBuffer) == u"![CDATA[")
1064                             break;
1065                         if (bFuzzing && sTmpBuffer.getLength() > 1024)
1066                         {
1067                             SAL_WARN("svtools", "abandoning import for performance reasons with long tokens");
1068                             eState = SvParserState::Error;
1069                             break;
1070                         }
1071                     } while( '>' != nNextCh && '/' != nNextCh && !rtl::isAsciiWhiteSpace( nNextCh ) &&
1072                             !linguistic::IsControlChar(nNextCh) &&
1073                              IsParserWorking() && !rInput.eof() );
1074
1075                     if( !sTmpBuffer.isEmpty() )
1076                     {
1077                         aToken.append( sTmpBuffer );
1078                         sTmpBuffer.setLength(0);
1079                     }
1080
1081                     // Skip blanks
1082                     while( rtl::isAsciiWhiteSpace( nNextCh ) && IsParserWorking() )
1083                         nNextCh = GetNextChar();
1084
1085                     if( !IsParserWorking() )
1086                     {
1087                         if( SvParserState::Pending == eState )
1088                             bReadNextChar = bReadNextCharSave;
1089                         break;
1090                     }
1091
1092                     // Search token in table:
                         // (the lookup uses the lower-cased name; the original
                         // spelling is kept in sSaveToken)
1093                     sSaveToken = aToken;
1094                     aToken = aToken.toString().toAsciiLowerCase();
1095
                         // Strip a configured namespace prefix before the lookup.
1096                     if (!maNamespace.isEmpty() && o3tl::starts_with(aToken, maNamespace))
1097                         aToken.remove( 0, maNamespace.getLength());
1098
1099                     if( HtmlTokenId::NONE == (nRet = GetHTMLToken( aToken )) )
1100                         // Unknown control
1101                         nRet = HtmlTokenId::UNKNOWNCONTROL_ON;
1102
1103                     // If it's a token which can be switched off...
1104                     if( bOffState )
1105                     {
1106                          if( nRet >= HtmlTokenId::ONOFF_START )
1107                          {
1108                             // and there is an off token, return off token instead
                                 // (presumably each <TOKEN>_OFF id directly
                                 // follows its <TOKEN>_ON id - verify against
                                 // the HtmlTokenId declaration)
1109                             nRet = static_cast<HtmlTokenId>(static_cast<int>(nRet) + 1);
1110                          }
1111                          else if( HtmlTokenId::LINEBREAK!=nRet || !maNamespace.isEmpty())
1112                          {
1113                             // and there is no off token, return unknown token.
1114                             // (except for </BR>, that is treated like <BR>)
1115                             // No exception for XHTML, though.
1116                             nRet = HtmlTokenId::UNKNOWNCONTROL_OFF;
1117                          }
1118                     }
1119
1120                     if( nRet == HtmlTokenId::COMMENT )
1121                     {
1122                         // fix: due to being case sensitive use sSaveToken as start of comment
1123                         //      and append a blank.
1124                         aToken = sSaveToken;
1125                         if( '>'!=nNextCh )
1126                             aToken.append( " " );
                             // State at the first '>' seen; used as fallback
                             // end of the comment if no "-->" follows.
1127                         sal_uInt64 nCStreamPos = 0;
1128                         sal_uInt32 nCLineNr = 0;
1129                         sal_uInt32 nCLinePos = 0;
1130                         sal_Int32 nCStrLen = 0;
1131
1132                         bool bDone = false;
1133                         // Read until closing -->. If not found restart at first >
1134                         sTmpBuffer = aToken;
1135                         while( !bDone && !rInput.eof() && IsParserWorking() )
1136                         {
1137                             if( '>'==nNextCh )
1138                             {
1139                                 if( !nCStreamPos )
1140                                 {
1141                                     nCStreamPos = rInput.Tell();
1142                                     nCStrLen = sTmpBuffer.getLength();
1143                                     nCLineNr = GetLineNr();
1144                                     nCLinePos = GetLinePos();
1145                                 }
1146                                 bDone = sTmpBuffer.getLength() >= 2 && sTmpBuffer[sTmpBuffer.getLength() - 2] == '-' && sTmpBuffer[sTmpBuffer.getLength() - 1] == '-';
1147                                 if( !bDone )
1148                                     sTmpBuffer.appendUtf32(nNextCh);
1149                             }
1150                             else if (!linguistic::IsControlChar(nNextCh)
1151                                 || nNextCh == '\r' || nNextCh == '\n' || nNextCh == '\t')
1152                             {
1153                                 sTmpBuffer.appendUtf32(nNextCh);
1154                             }
1155                             if( !bDone )
1156                                 nNextCh = GetNextChar();
1157                         }
1158                         aToken = sTmpBuffer;
1159                         sTmpBuffer.setLength(0);
1160                         if( !bDone && IsParserWorking() && nCStreamPos )
1161                         {
                                 // No "-->" found: end the comment at the
                                 // first '>' and drop what was read after it.
1162                             rInput.Seek( nCStreamPos );
1163                             SetLineNr( nCLineNr );
1164                             SetLinePos( nCLinePos );
1165                             ClearTxtConvContext();
1166                             aToken.truncate(nCStrLen);
1167                             nNextCh = '>';
1168                         }
1169                     }
1170                     else if (nRet == HtmlTokenId::CDATA)
1171                     {
1172                         // Read until the closing ]]>.
                             // sTmpBuffer is empty here, so aToken ends up
                             // holding only the CDATA content; control
                             // characters are skipped.
1173                         bool bDone = false;
1174                         while (!bDone && !rInput.eof() && IsParserWorking())
1175                         {
1176                             if (nNextCh == '>')
1177                             {
1178                                 if (sTmpBuffer.getLength() >= 2)
1179                                 {
1180                                     bDone = sTmpBuffer[sTmpBuffer.getLength() - 2] == ']'
1181                                             && sTmpBuffer[sTmpBuffer.getLength() - 1] == ']';
1182                                     if (bDone)
1183                                     {
1184                                         // Ignore ]] at the end.
1185                                         sTmpBuffer.setLength(sTmpBuffer.getLength() - 2);
1186                                     }
1187                                 }
1188                                 if (!bDone)
1189                                 {
1190                                     sTmpBuffer.appendUtf32(nNextCh);
1191                                 }
1192                             }
1193                             else if (!linguistic::IsControlChar(nNextCh))
1194                             {
1195                                 sTmpBuffer.appendUtf32(nNextCh);
1196                             }
1197                             if (!bDone)
1198                             {
1199                                 nNextCh = GetNextChar();
1200                             }
1201                         }
1202                         aToken = sTmpBuffer;
1203                         sTmpBuffer.setLength(0);
1204                     }
1205                     else
1206                     {
1207                         // TokenString not needed anymore
1208                         aToken.setLength( 0 );
1209                     }
1210
1211                     // Read until closing '>'
                         // (presumably ScanText( '>' ) collects the rest of
                         // the tag, i.e. its options, in aToken - the start
                         // of ScanText is not visible here)
1212                     if( '>' != nNextCh && IsParserWorking() )
1213                     {
1214                         ScanText( '>' );
1215
1216                         // fdo#34666 fdo#36080 fdo#36390: closing "/>"?:
1217                         // generate pending HtmlTokenId::<TOKEN>_OFF for HtmlTokenId::<TOKEN>_ON
1218                         // Do not convert this to a single HtmlTokenId::<TOKEN>_OFF
1219                         // which lead to fdo#56772.
1220                         if ((nRet >= HtmlTokenId::ONOFF_START) && o3tl::ends_with(aToken, u"/"))
1221                         {
1222                             mnPendingOffToken = static_cast<HtmlTokenId>(static_cast<int>(nRet) + 1);       // HtmlTokenId::<TOKEN>_ON -> HtmlTokenId::<TOKEN>_OFF
1223                             aToken.setLength( aToken.getLength()-1 );   // remove trailing '/'
1224                         }
1225                         if( sal_Unicode(EOF) == nNextCh && rInput.eof() )
1226                         {
1227                             // Move back in front of < and restart there.
1228                             // Return < as text.
1229                             rInput.Seek( nStreamPos );
1230                             SetLineNr( nLineNr );
1231                             SetLinePos( nLinePos );
1232                             ClearTxtConvContext();
1233
1234                             aToken = "<";
1235                             nRet = HtmlTokenId::TEXTTOKEN;
1236                             nNextCh = GetNextChar();
1237                             bNextCh = false;
1238                             break;
1239                         }
1240                     }
1241                     if( SvParserState::Pending == eState )
1242                         bReadNextChar = bReadNextCharSave;
1243                 }
1244                 else
1245                 {
1246                     if( bOffState )
1247                     {
1248                         // simply throw away everything
1249                         ScanText( '>' );
1250                         if( sal_Unicode(EOF) == nNextCh && rInput.eof() )
1251                         {
1252                             // Move back in front of < and restart there.
1253                             // Return < as text.
1254                             rInput.Seek( nStreamPos );
1255                             SetLineNr( nLineNr );
1256                             SetLinePos( nLinePos );
1257                             ClearTxtConvContext();
1258
1259                             aToken = "<";
1260                             nRet = HtmlTokenId::TEXTTOKEN;
1261                             nNextCh = GetNextChar();
1262                             bNextCh = false;
1263                             break;
1264                         }
1265                         if( SvParserState::Pending == eState )
1266                             bReadNextChar = bReadNextCharSave;
1267                         aToken.setLength( 0 );
1268                     }
1269                     else if( '%' == nNextCh )
1270                     {
                             // "<% ... %>" block: reported as unknown control,
                             // its text is handed over in sSaveToken.
1271                         nRet = HtmlTokenId::UNKNOWNCONTROL_ON;
1272
1273                         sal_uInt64 nCStreamPos = rInput.Tell();
1274                         sal_uInt32 nCLineNr = GetLineNr(), nCLinePos = GetLinePos();
1275
1276                         bool bDone = false;
1277                         // Read until closing %>. If not found restart at first >.
1278                         sal_Unicode nLastTokenChar = !aToken.isEmpty() ? aToken[aToken.getLength() - 1] : 0;
1279                         OUStringBuffer aTmpBuffer(aToken);
1280                         while( !bDone && !rInput.eof() && IsParserWorking() )
1281                         {
1282                             bDone = '>'==nNextCh && nLastTokenChar == '%';
1283                             if( !bDone )
1284                             {
1285                                 aTmpBuffer.appendUtf32(nNextCh);
1286                                 nLastTokenChar = aTmpBuffer[aTmpBuffer.getLength() - 1];
1287                                 nNextCh = GetNextChar();
1288                             }
1289                         }
1290                         if( !bDone && IsParserWorking() )
1291                         {
                                 // No closing "%>" found: rewind to just
                                 // behind "<%" and return "<%" as plain text.
1292                             rInput.Seek( nCStreamPos );
1293                             SetLineNr( nCLineNr );
1294                             SetLinePos( nCLinePos );
1295                             ClearTxtConvContext();
1296                             aToken = "<%";
1297                             nRet = HtmlTokenId::TEXTTOKEN;
1298                             break;
1299                         }
1300                         aToken = aTmpBuffer;
1301                         aTmpBuffer.setLength(0);
1302                         if( IsParserWorking() )
1303                         {
1304                             sSaveToken = aToken;
1305                             aToken.setLength( 0 );
1306                         }
1307                     }
1308                     else
1309                     {
                             // '<' not followed by anything tag-like: it is
                             // ordinary text; nNextCh is still unprocessed.
1310                         aToken = "<";
1311                         nRet = HtmlTokenId::TEXTTOKEN;
1312                         bNextCh = false;
1313                         break;
1314                     }
1315                 }
1316
1317                 if( IsParserWorking() )
1318                 {
                         // Consume the closing '>' only if the scan really
                         // stopped at one.
1319                     bNextCh = '>' == nNextCh;
                         // Update the scanner modes that depend on the tag.
1320                     switch( nRet )
1321                     {
1322                     case HtmlTokenId::TEXTAREA_ON:
1323                         bReadTextArea = true;
1324                         break;
1325                     case HtmlTokenId::TEXTAREA_OFF:
1326                         bReadTextArea = false;
1327                         break;
1328                     case HtmlTokenId::SCRIPT_ON:
1329                         if( !bReadTextArea )
1330                             bReadScript = true;
1331                         break;
1332                     case HtmlTokenId::SCRIPT_OFF:
1333                         if( !bReadTextArea )
1334                         {
1335                             bReadScript = false;
1336                             // JavaScript might modify the stream,
1337                             // thus the last character has to be read again.
1338                             bReadNextChar = true;
1339                             bNextCh = false;
1340                         }
1341                         break;
1342
1343                     case HtmlTokenId::STYLE_ON:
1344                         bReadStyle = true;
1345                         break;
1346                     case HtmlTokenId::STYLE_OFF:
1347                         bReadStyle = false;
1348                         break;
1349                     default: break;
1350                     }
1351                 }
1352             }
1353             break;
1354
1355         case sal_Unicode(EOF):
1356             if( rInput.eof() )
1357             {
                     // Real end of the input: parsing is complete.
1358                 eState = SvParserState::Accepted;
1359                 nRet = HtmlTokenId(nNextCh);
1360             }
1361             else
1362             {
1363                 // Read normal text.
1364                 goto scan_text;
1365             }
1366             break;
1367
1368         case '\f':
1369             // form feeds are passed upwards separately
1370             nRet = HtmlTokenId::LINEFEEDCHAR; // !!! should be FORMFEEDCHAR
1371             break;
1372
1373         case '\n':
1374         case '\r':
1375             if( bReadListing || bReadXMP || bReadPRE || bReadTextArea )
1376             {
                     // In these modes a line break is significant. A "\r\n"
                     // or "\n\r" pair counts as one break; any other
                     // look-ahead character is kept as the next character.
1377                 sal_Unicode c = GetNextChar();
1378                 if( ( '\n' != nNextCh || '\r' != c ) &&
1379                     ( '\r' != nNextCh || '\n' != c ) )
1380                 {
1381                     bNextCh = false;
1382                     nNextCh = c;
1383                 }
1384                 nRet = HtmlTokenId::NEWPARA;
1385                 break;
1386             }
1387             [[fallthrough]];
1388         case '\t':
1389             if( bReadPRE )
1390             {
1391                 nRet = HtmlTokenId::TABCHAR;
1392                 break;
1393             }
1394             [[fallthrough]];
1395         case ' ':
1396             [[fallthrough]];
1397         default:
1398
1399 scan_text:
1400             // "normal" text to come
1401             nRet = ScanText();
                 // If no text was collected, go on with the next character.
1402             bNextCh = 0 == aToken.getLength();
1403
1404             // the text should be processed
1405             if( !bNextCh && eState == SvParserState::Pending )
1406             {
1407                 eState = SvParserState::Working;
1408                 bReadNextChar = true;
1409             }
1410
1411             break;
1412         }
1413
1414         if( bNextCh && SvParserState::Working == eState )
1415         {
1416             nNextCh = GetNextChar();
                 // A complete non-text token was already recognized: deliver
                 // it now and fetch the character again on the next call.
1417             if( SvParserState::Pending == eState && nRet != HtmlTokenId::NONE && HtmlTokenId::TEXTTOKEN != nRet )
1418             {
1419                 bReadNextChar = true;
1420                 eState = SvParserState::Working;
1421             }
1422         }
1423
1424     } while( nRet == HtmlTokenId::NONE && SvParserState::Working == eState );
1425
1426     if( SvParserState::Pending == eState )
1427         nRet = HtmlTokenId::INVALID;      // s.th. invalid
1428
1429     return nRet;
1430 }
1431
1432 void HTMLParser::UnescapeToken()
1433 {
1434     // Strip each unescaped backslash from aToken in place; the character
1435     // that follows a stripped backslash is kept verbatim.
1436     bool bPrevWasEscape = false;
1437     for( sal_Int32 nIdx = 0; nIdx < aToken.getLength(); )
1438     {
1439         const bool bIsEscape = !bPrevWasEscape && '\\' == aToken[nIdx];
1440         if( bIsEscape )
1441         {
1442             // Drop the backslash; nIdx now addresses the escaped character.
1443             aToken.remove( nIdx, 1 );
1444         }
1445         else
1446         {
1447             ++nIdx;
1448         }
1449         bPrevWasEscape = bIsEscape;
1450     }
1451 }
1452
1453 const HTMLOptions& HTMLParser::GetOptions( HtmlOptionId const *pNoConvertToken )
1454 {
1455     // If the options for the current token have already been returned,
1456     // return them once again.
1457     if (!maOptions.empty())
1458         return maOptions;
1459
1460     sal_Int32 nPos = 0;
1461     while( nPos < aToken.getLength() )
1462     {
1463         // A letter? Option beginning here.
1464         if( rtl::isAsciiAlpha( aToken[nPos] ) )
1465         {
1466             HtmlOptionId nToken;
1467             OUString aValue;
1468             sal_Int32 nStt = nPos;
1469             sal_Unicode cChar = 0;
1470
1471             // Actually only certain characters allowed.
1472             // Netscape only looks for "=" and white space (c.f.
1473             // Mozilla: PA_FetchRequestedNameValues in libparse/pa_mdl.c)
1474             while( nPos < aToken.getLength() )
1475             {
1476                 cChar = aToken[nPos];
1477                 if ( '=' == cChar ||!HTML_ISPRINTABLE(cChar) || rtl::isAsciiWhiteSpace(cChar) )
1478                     break;
1479                 nPos++;
1480             }
1481
1482             OUString sName( aToken.subView( nStt, nPos-nStt ) );
1483
1484             // PlugIns require original token name. Convert to lower case only for searching.
1485             nToken = GetHTMLOption( sName.toAsciiLowerCase() ); // Name is ready
1486             SAL_WARN_IF( nToken==HtmlOptionId::UNKNOWN, "svtools",
1487                         "GetOption: unknown HTML option '" << sName << "'" );
1488             bool bStripCRLF = (nToken < HtmlOptionId::SCRIPT_START ||
1489                                nToken >= HtmlOptionId::SCRIPT_END) &&
1490                               (!pNoConvertToken || nToken != *pNoConvertToken);
1491
1492             while( nPos < aToken.getLength() )
1493             {
1494                 cChar = aToken[nPos];
1495                 if ( HTML_ISPRINTABLE(cChar) && !rtl::isAsciiWhiteSpace(cChar) )
1496                     break;
1497                 nPos++;
1498             }
1499
1500             // Option with value?
1501             if( nPos!=aToken.getLength() && '='==cChar )
1502             {
1503                 nPos++;
1504
1505                 while( nPos < aToken.getLength() )
1506                 {
1507                     cChar = aToken[nPos];
1508                     if ( HTML_ISPRINTABLE(cChar) && ' ' != cChar && '\t' != cChar && '\r' != cChar && '\n' != cChar )
1509                         break;
1510                     nPos++;
1511                 }
1512
1513                 if( nPos != aToken.getLength() )
1514                 {
1515                     sal_Int32 nLen = 0;
1516                     nStt = nPos;
1517                     if( ('"'==cChar) || '\''==cChar )
1518                     {
1519                         sal_Unicode cEnd = cChar;
1520                         nPos++; nStt++;
1521                         bool bDone = false;
1522                         bool bEscape = false;
1523                         while( nPos < aToken.getLength() && !bDone )
1524                         {
1525                             bool bOldEscape = bEscape;
1526                             bEscape = false;
1527                             cChar = aToken[nPos];
1528                             switch( cChar )
1529                             {
1530                             case '\r':
1531                             case '\n':
1532                                 if( bStripCRLF )
1533                                     aToken.remove( nPos, 1 );
1534                                 else
1535                                 {
1536                                     nPos++;
1537                                     nLen++;
1538                                 }
1539                                 break;
1540                             case '\\':
1541                                 if( bOldEscape )
1542                                 {
1543                                     nPos++;
1544                                     nLen++;
1545                                 }
1546                                 else
1547                                 {
1548                                     aToken.remove( nPos, 1 );
1549                                     bEscape = true;
1550                                 }
1551                                 break;
1552                             case '"':
1553                             case '\'':
1554                                 bDone = !bOldEscape && cChar==cEnd;
1555                                 if( !bDone )
1556                                 {
1557                                     nPos++;
1558                                     nLen++;
1559                                 }
1560                                 break;
1561                             default:
1562                                 nPos++;
1563                                 nLen++;
1564                                 break;
1565                             }
1566                         }
1567                         if( nPos!=aToken.getLength() )
1568                             nPos++;
1569                     }
1570                     else
1571                     {
1572                         // More liberal than the standard: allow all printable characters
1573                         bool bEscape = false;
1574                         bool bDone = false;
1575                         while( nPos < aToken.getLength() && !bDone )
1576                         {
1577                             bool bOldEscape = bEscape;
1578                             bEscape = false;
1579                             sal_Unicode c = aToken[nPos];
1580                             switch( c )
1581                             {
1582                             case ' ':
1583                                 bDone = !bOldEscape;
1584                                 if( !bDone )
1585                                 {
1586                                     nPos++;
1587                                     nLen++;
1588                                 }
1589                                 break;
1590
1591                             case '\t':
1592                             case '\r':
1593                             case '\n':
1594                                 bDone = true;
1595                                 break;
1596
1597                             case '\\':
1598                                 if( bOldEscape )
1599                                 {
1600                                     nPos++;
1601                                     nLen++;
1602                                 }
1603                                 else
1604                                 {
1605                                     aToken.remove( nPos, 1 );
1606                                     bEscape = true;
1607                                 }
1608                                 break;
1609
1610                             default:
1611                                 if( HTML_ISPRINTABLE( c ) )
1612                                 {
1613                                     nPos++;
1614                                     nLen++;
1615                                 }
1616                                 else
1617                                     bDone = true;
1618                                 break;
1619                             }
1620                         }
1621                     }
1622
1623                     if( nLen )
1624                         aValue = aToken.subView( nStt, nLen );
1625                 }
1626             }
1627
1628             // Token is known and can be saved
1629             maOptions.emplace_back(nToken, sName, aValue);
1630
1631         }
1632         else
1633             // Ignore white space and unexpected characters
1634             nPos++;
1635     }
1636
1637     return maOptions;
1638 }
1639
1640 HtmlTokenId HTMLParser::FilterPRE( HtmlTokenId nToken )
1641 {
1642     switch( nToken )
1643     {
1644     // in Netscape they only have impact in not empty paragraphs
1645     case HtmlTokenId::PARABREAK_ON:
1646         nToken = HtmlTokenId::LINEBREAK;
1647         [[fallthrough]];
1648     case HtmlTokenId::LINEBREAK:
1649     case HtmlTokenId::NEWPARA:
1650         nPre_LinePos = 0;
1651         if( bPre_IgnoreNewPara )
1652             nToken = HtmlTokenId::NONE;
1653         break;
1654
1655     case HtmlTokenId::TABCHAR:
1656         {
1657             sal_Int32 nSpaces = 8 - (nPre_LinePos % 8);
1658             DBG_ASSERT( aToken.isEmpty(), "Why is the token not empty?" );
1659             if (aToken.getLength() < nSpaces)
1660             {
1661                 using comphelper::string::padToLength;
1662                 OUStringBuffer aBuf(aToken);
1663                 aToken = padToLength(aBuf, nSpaces, ' ');
1664             }
1665             nPre_LinePos += nSpaces;
1666             nToken = HtmlTokenId::TEXTTOKEN;
1667         }
1668         break;
1669     // Keep those
1670     case HtmlTokenId::TEXTTOKEN:
1671         nPre_LinePos += aToken.getLength();
1672         break;
1673
1674     case HtmlTokenId::SELECT_ON:
1675     case HtmlTokenId::SELECT_OFF:
1676     case HtmlTokenId::BODY_ON:
1677     case HtmlTokenId::FORM_ON:
1678     case HtmlTokenId::FORM_OFF:
1679     case HtmlTokenId::INPUT:
1680     case HtmlTokenId::OPTION:
1681     case HtmlTokenId::TEXTAREA_ON:
1682     case HtmlTokenId::TEXTAREA_OFF:
1683
1684     case HtmlTokenId::IMAGE:
1685     case HtmlTokenId::APPLET_ON:
1686     case HtmlTokenId::APPLET_OFF:
1687     case HtmlTokenId::PARAM:
1688     case HtmlTokenId::EMBED:
1689
1690     case HtmlTokenId::HEAD1_ON:
1691     case HtmlTokenId::HEAD1_OFF:
1692     case HtmlTokenId::HEAD2_ON:
1693     case HtmlTokenId::HEAD2_OFF:
1694     case HtmlTokenId::HEAD3_ON:
1695     case HtmlTokenId::HEAD3_OFF:
1696     case HtmlTokenId::HEAD4_ON:
1697     case HtmlTokenId::HEAD4_OFF:
1698     case HtmlTokenId::HEAD5_ON:
1699     case HtmlTokenId::HEAD5_OFF:
1700     case HtmlTokenId::HEAD6_ON:
1701     case HtmlTokenId::HEAD6_OFF:
1702     case HtmlTokenId::BLOCKQUOTE_ON:
1703     case HtmlTokenId::BLOCKQUOTE_OFF:
1704     case HtmlTokenId::ADDRESS_ON:
1705     case HtmlTokenId::ADDRESS_OFF:
1706     case HtmlTokenId::HORZRULE:
1707
1708     case HtmlTokenId::CENTER_ON:
1709     case HtmlTokenId::CENTER_OFF:
1710     case HtmlTokenId::DIVISION_ON:
1711     case HtmlTokenId::DIVISION_OFF:
1712
1713     case HtmlTokenId::SCRIPT_ON:
1714     case HtmlTokenId::SCRIPT_OFF:
1715     case HtmlTokenId::RAWDATA:
1716
1717     case HtmlTokenId::TABLE_ON:
1718     case HtmlTokenId::TABLE_OFF:
1719     case HtmlTokenId::CAPTION_ON:
1720     case HtmlTokenId::CAPTION_OFF:
1721     case HtmlTokenId::COLGROUP_ON:
1722     case HtmlTokenId::COLGROUP_OFF:
1723     case HtmlTokenId::COL_ON:
1724     case HtmlTokenId::COL_OFF:
1725     case HtmlTokenId::THEAD_ON:
1726     case HtmlTokenId::THEAD_OFF:
1727     case HtmlTokenId::TFOOT_ON:
1728     case HtmlTokenId::TFOOT_OFF:
1729     case HtmlTokenId::TBODY_ON:
1730     case HtmlTokenId::TBODY_OFF:
1731     case HtmlTokenId::TABLEROW_ON:
1732     case HtmlTokenId::TABLEROW_OFF:
1733     case HtmlTokenId::TABLEDATA_ON:
1734     case HtmlTokenId::TABLEDATA_OFF:
1735     case HtmlTokenId::TABLEHEADER_ON:
1736     case HtmlTokenId::TABLEHEADER_OFF:
1737
1738     case HtmlTokenId::ANCHOR_ON:
1739     case HtmlTokenId::ANCHOR_OFF:
1740     case HtmlTokenId::BOLD_ON:
1741     case HtmlTokenId::BOLD_OFF:
1742     case HtmlTokenId::ITALIC_ON:
1743     case HtmlTokenId::ITALIC_OFF:
1744     case HtmlTokenId::STRIKE_ON:
1745     case HtmlTokenId::STRIKE_OFF:
1746     case HtmlTokenId::STRIKETHROUGH_ON:
1747     case HtmlTokenId::STRIKETHROUGH_OFF:
1748     case HtmlTokenId::UNDERLINE_ON:
1749     case HtmlTokenId::UNDERLINE_OFF:
1750     case HtmlTokenId::BASEFONT_ON:
1751     case HtmlTokenId::BASEFONT_OFF:
1752     case HtmlTokenId::FONT_ON:
1753     case HtmlTokenId::FONT_OFF:
1754     case HtmlTokenId::BLINK_ON:
1755     case HtmlTokenId::BLINK_OFF:
1756     case HtmlTokenId::SPAN_ON:
1757     case HtmlTokenId::SPAN_OFF:
1758     case HtmlTokenId::SUBSCRIPT_ON:
1759     case HtmlTokenId::SUBSCRIPT_OFF:
1760     case HtmlTokenId::SUPERSCRIPT_ON:
1761     case HtmlTokenId::SUPERSCRIPT_OFF:
1762     case HtmlTokenId::BIGPRINT_ON:
1763     case HtmlTokenId::BIGPRINT_OFF:
1764     case HtmlTokenId::SMALLPRINT_OFF:
1765     case HtmlTokenId::SMALLPRINT_ON:
1766
1767     case HtmlTokenId::EMPHASIS_ON:
1768     case HtmlTokenId::EMPHASIS_OFF:
1769     case HtmlTokenId::CITATION_ON:
1770     case HtmlTokenId::CITATION_OFF:
1771     case HtmlTokenId::STRONG_ON:
1772     case HtmlTokenId::STRONG_OFF:
1773     case HtmlTokenId::CODE_ON:
1774     case HtmlTokenId::CODE_OFF:
1775     case HtmlTokenId::SAMPLE_ON:
1776     case HtmlTokenId::SAMPLE_OFF:
1777     case HtmlTokenId::KEYBOARD_ON:
1778     case HtmlTokenId::KEYBOARD_OFF:
1779     case HtmlTokenId::VARIABLE_ON:
1780     case HtmlTokenId::VARIABLE_OFF:
1781     case HtmlTokenId::DEFINSTANCE_ON:
1782     case HtmlTokenId::DEFINSTANCE_OFF:
1783     case HtmlTokenId::SHORTQUOTE_ON:
1784     case HtmlTokenId::SHORTQUOTE_OFF:
1785     case HtmlTokenId::LANGUAGE_ON:
1786     case HtmlTokenId::LANGUAGE_OFF:
1787     case HtmlTokenId::AUTHOR_ON:
1788     case HtmlTokenId::AUTHOR_OFF:
1789     case HtmlTokenId::PERSON_ON:
1790     case HtmlTokenId::PERSON_OFF:
1791     case HtmlTokenId::ACRONYM_ON:
1792     case HtmlTokenId::ACRONYM_OFF:
1793     case HtmlTokenId::ABBREVIATION_ON:
1794     case HtmlTokenId::ABBREVIATION_OFF:
1795     case HtmlTokenId::INSERTEDTEXT_ON:
1796     case HtmlTokenId::INSERTEDTEXT_OFF:
1797     case HtmlTokenId::DELETEDTEXT_ON:
1798     case HtmlTokenId::DELETEDTEXT_OFF:
1799     case HtmlTokenId::TELETYPE_ON:
1800     case HtmlTokenId::TELETYPE_OFF:
1801
1802         break;
1803
1804     // The remainder is treated as an unknown token.
1805     default:
1806         if( nToken != HtmlTokenId::NONE )
1807         {
1808             nToken =
1809                 ( ((nToken >= HtmlTokenId::ONOFF_START) && isOffToken(nToken))
1810                     ? HtmlTokenId::UNKNOWNCONTROL_OFF
1811                     : HtmlTokenId::UNKNOWNCONTROL_ON );
1812         }
1813         break;
1814     }
1815
1816     bPre_IgnoreNewPara = false;
1817
1818     return nToken;
1819 }
1820
1821 HtmlTokenId HTMLParser::FilterXMP( HtmlTokenId nToken )
1822 {
1823     switch( nToken )
1824     {
1825     case HtmlTokenId::NEWPARA:
1826         if( bPre_IgnoreNewPara )
1827             nToken = HtmlTokenId::NONE;
1828         [[fallthrough]];
1829     case HtmlTokenId::TEXTTOKEN:
1830     case HtmlTokenId::NONBREAKSPACE:
1831     case HtmlTokenId::SOFTHYPH:
1832         break;              // kept
1833
1834     default:
1835         if( nToken != HtmlTokenId::NONE )
1836         {
1837             if( (nToken >= HtmlTokenId::ONOFF_START) && isOffToken(nToken) )
1838             {
1839                 sSaveToken = "</" + sSaveToken;
1840             }
1841             else
1842                 sSaveToken = "<" + sSaveToken;
1843             if( !aToken.isEmpty() )
1844             {
1845                 UnescapeToken();
1846                 sSaveToken += " ";
1847                 aToken.insert(0, sSaveToken);
1848             }
1849             else
1850                 aToken = sSaveToken;
1851             aToken.append( ">" );
1852             nToken = HtmlTokenId::TEXTTOKEN;
1853         }
1854         break;
1855     }
1856
1857     bPre_IgnoreNewPara = false;
1858
1859     return nToken;
1860 }
1861
1862 HtmlTokenId HTMLParser::FilterListing( HtmlTokenId nToken )
1863 {
1864     switch( nToken )
1865     {
1866     case HtmlTokenId::NEWPARA:
1867         if( bPre_IgnoreNewPara )
1868             nToken = HtmlTokenId::NONE;
1869         [[fallthrough]];
1870     case HtmlTokenId::TEXTTOKEN:
1871     case HtmlTokenId::NONBREAKSPACE:
1872     case HtmlTokenId::SOFTHYPH:
1873         break;      // kept
1874
1875     default:
1876         if( nToken != HtmlTokenId::NONE )
1877         {
1878             nToken =
1879                 ( ((nToken >= HtmlTokenId::ONOFF_START) && isOffToken(nToken))
1880                     ? HtmlTokenId::UNKNOWNCONTROL_OFF
1881                     : HtmlTokenId::UNKNOWNCONTROL_ON );
1882         }
1883         break;
1884     }
1885
1886     bPre_IgnoreNewPara = false;
1887
1888     return nToken;
1889 }
1890
1891 bool HTMLParser::InternalImgToPrivateURL( OUString& rURL )
1892 {
1893     bool bFound = false;
1894
1895     if( rURL.startsWith( OOO_STRING_SVTOOLS_HTML_internal_icon ) )
1896     {
1897         OUString aName( rURL.copy(14) );
1898         switch( aName[0] )
1899         {
1900         case 'b':
1901             bFound = aName == OOO_STRING_SVTOOLS_HTML_INT_ICON_baddata;
1902             break;
1903         case 'd':
1904             bFound = aName == OOO_STRING_SVTOOLS_HTML_INT_ICON_delayed;
1905             break;
1906         case 'e':
1907             bFound = aName == OOO_STRING_SVTOOLS_HTML_INT_ICON_embed;
1908             break;
1909         case 'i':
1910             bFound = aName == OOO_STRING_SVTOOLS_HTML_INT_ICON_insecure;
1911             break;
1912         case 'n':
1913             bFound = aName == OOO_STRING_SVTOOLS_HTML_INT_ICON_notfound;
1914             break;
1915         }
1916     }
1917     if( bFound )
1918     {
1919         OUString sTmp ( rURL );
1920         rURL =  OOO_STRING_SVTOOLS_HTML_private_image;
1921         rURL += sTmp;
1922     }
1923
1924     return bFound;
1925 }
1926
namespace {

// Kind of META entry recognized from its NAME / HTTP-EQUIV value (looked up
// through aHTMLMetaNameTable); selects what ParseMetaOptionsImpl does with
// the CONTENT value.
enum class HtmlMeta {
    NONE = 0,       // no known keyword; content is stored as a user-defined property
    Author,         // document author
    Description,    // document description (line breaks are converted, not removed)
    Keywords,       // comma separated keyword list
    Refresh,        // no document property is set for this one
    Classification, // stored as the document subject
    Created,        // creation date
    ChangedBy,      // name of the last modifier
    Changed,        // modification date
    Generator,      // recognized, but not evaluated by ParseMetaOptionsImpl
    SDFootnote,     // recognized, but not evaluated by ParseMetaOptionsImpl
    SDEndnote,      // recognized, but not evaluated by ParseMetaOptionsImpl
    ContentType     // content is parsed for a charset that sets the encoding
};

}
1946
1947 // <META NAME=xxx>
1948 HTMLOptionEnum<HtmlMeta> const aHTMLMetaNameTable[] =
1949 {
1950     { OOO_STRING_SVTOOLS_HTML_META_author,        HtmlMeta::Author        },
1951     { OOO_STRING_SVTOOLS_HTML_META_changed,       HtmlMeta::Changed       },
1952     { OOO_STRING_SVTOOLS_HTML_META_changedby,     HtmlMeta::ChangedBy     },
1953     { OOO_STRING_SVTOOLS_HTML_META_classification,HtmlMeta::Classification},
1954     { OOO_STRING_SVTOOLS_HTML_META_content_type,  HtmlMeta::ContentType   },
1955     { OOO_STRING_SVTOOLS_HTML_META_created,       HtmlMeta::Created       },
1956     { OOO_STRING_SVTOOLS_HTML_META_description,   HtmlMeta::Description   },
1957     { OOO_STRING_SVTOOLS_HTML_META_keywords,      HtmlMeta::Keywords      },
1958     { OOO_STRING_SVTOOLS_HTML_META_generator,     HtmlMeta::Generator     },
1959     { OOO_STRING_SVTOOLS_HTML_META_refresh,       HtmlMeta::Refresh       },
1960     { OOO_STRING_SVTOOLS_HTML_META_sdendnote,     HtmlMeta::SDEndnote     },
1961     { OOO_STRING_SVTOOLS_HTML_META_sdfootnote,    HtmlMeta::SDFootnote    },
1962     { nullptr,                                    HtmlMeta(0)             }
1963 };
1964
1965
// Called by ParseMetaOptionsImpl with the META name right after that name was
// added as a user-defined document property. This implementation ignores the
// name and does nothing.
// NOTE(review): presumably meant as a hook for derived parsers - confirm
// against the class declaration.
void HTMLParser::AddMetaUserDefined( OUString const & )
{
}
1969
1970 bool HTMLParser::ParseMetaOptionsImpl(
1971         const uno::Reference<document::XDocumentProperties> & i_xDocProps,
1972         SvKeyValueIterator *i_pHTTPHeader,
1973         const HTMLOptions& aOptions,
1974         rtl_TextEncoding& o_rEnc )
1975 {
1976     OUString aName, aContent;
1977     HtmlMeta nAction = HtmlMeta::NONE;
1978     bool bHTTPEquiv = false, bChanged = false;
1979
1980     for ( size_t i = aOptions.size(); i; )
1981     {
1982         const HTMLOption& aOption = aOptions[--i];
1983         switch ( aOption.GetToken() )
1984         {
1985             case HtmlOptionId::NAME:
1986                 aName = aOption.GetString();
1987                 if ( HtmlMeta::NONE==nAction )
1988                 {
1989                     aOption.GetEnum( nAction, aHTMLMetaNameTable );
1990                 }
1991                 break;
1992             case HtmlOptionId::HTTPEQUIV:
1993                 aName = aOption.GetString();
1994                 aOption.GetEnum( nAction, aHTMLMetaNameTable );
1995                 bHTTPEquiv = true;
1996                 break;
1997             case HtmlOptionId::CONTENT:
1998                 aContent = aOption.GetString();
1999                 break;
2000             case HtmlOptionId::CHARSET:
2001             {
2002                 OString sValue(OUStringToOString(aOption.GetString(), RTL_TEXTENCODING_ASCII_US));
2003                 o_rEnc = GetExtendedCompatibilityTextEncoding(rtl_getTextEncodingFromMimeCharset(sValue.getStr()));
2004                 break;
2005             }
2006             default: break;
2007         }
2008     }
2009
2010     if ( bHTTPEquiv || HtmlMeta::Description != nAction )
2011     {
2012         // if it is not a Description, remove CRs and LFs from CONTENT
2013         aContent = aContent.replaceAll("\r", "").replaceAll("\n", "");
2014     }
2015     else
2016     {
2017         // convert line endings for Description
2018         aContent = convertLineEnd(aContent, GetSystemLineEnd());
2019     }
2020
2021     if ( bHTTPEquiv && i_pHTTPHeader )
2022     {
2023         // Netscape seems to just ignore a closing ", so we do too
2024         if ( aContent.endsWith("\"") )
2025         {
2026             aContent = aContent.copy( 0, aContent.getLength() - 1 );
2027         }
2028         SvKeyValue aKeyValue( aName, aContent );
2029         i_pHTTPHeader->Append( aKeyValue );
2030     }
2031
2032     switch ( nAction )
2033     {
2034         case HtmlMeta::Author:
2035             if (i_xDocProps.is()) {
2036                 i_xDocProps->setAuthor( aContent );
2037                 bChanged = true;
2038             }
2039             break;
2040         case HtmlMeta::Description:
2041             if (i_xDocProps.is()) {
2042                 i_xDocProps->setDescription( aContent );
2043                 bChanged = true;
2044             }
2045             break;
2046         case HtmlMeta::Keywords:
2047             if (i_xDocProps.is()) {
2048                 i_xDocProps->setKeywords(
2049                     ::comphelper::string::convertCommaSeparated(aContent));
2050                 bChanged = true;
2051             }
2052             break;
2053         case HtmlMeta::Classification:
2054             if (i_xDocProps.is()) {
2055                 i_xDocProps->setSubject( aContent );
2056                 bChanged = true;
2057             }
2058             break;
2059
2060         case HtmlMeta::ChangedBy:
2061             if (i_xDocProps.is()) {
2062                 i_xDocProps->setModifiedBy( aContent );
2063                 bChanged = true;
2064             }
2065             break;
2066
2067         case HtmlMeta::Created:
2068         case HtmlMeta::Changed:
2069             if (i_xDocProps.is() && !aContent.isEmpty())
2070             {
2071                 ::util::DateTime uDT;
2072                 bool valid = false;
2073                 if (comphelper::string::getTokenCount(aContent, ';') == 2)
2074                 {
2075                     sal_Int32 nIdx{ 0 };
2076                     sal_Int32 nDate = o3tl::toInt32(o3tl::getToken(aContent, 0, ';', nIdx));
2077                     sal_Int64 nTime = o3tl::toInt64(o3tl::getToken(aContent, 0, ';', nIdx));
2078                     valid = nDate != std::numeric_limits<sal_Int32>::min() &&
2079                             nTime != std::numeric_limits<sal_Int64>::min();
2080                     if (valid)
2081                     {
2082                         Date aDate(nDate);
2083                         tools::Time aTime(tools::Time::fromEncodedTime(nTime));
2084                         uDT = DateTime(aDate, aTime).GetUNODateTime();
2085                     }
2086                 }
2087                 else if (utl::ISO8601parseDateTime(aContent, uDT))
2088                     valid = true;
2089
2090                 if (valid)
2091                 {
2092                     bChanged = true;
2093                     if (HtmlMeta::Created == nAction)
2094                         i_xDocProps->setCreationDate(uDT);
2095                     else
2096                         i_xDocProps->setModificationDate(uDT);
2097                 }
2098             }
2099             break;
2100
2101         case HtmlMeta::Refresh:
2102             DBG_ASSERT( !bHTTPEquiv || i_pHTTPHeader, "Lost Reload-URL because of omitted MUST change." );
2103             break;
2104
2105         case HtmlMeta::ContentType:
2106             if ( !aContent.isEmpty() )
2107             {
2108                 o_rEnc = GetEncodingByMIME( aContent );
2109             }
2110             break;
2111
2112         case HtmlMeta::NONE:
2113             if ( !bHTTPEquiv )
2114             {
2115                 if (i_xDocProps.is())
2116                 {
2117                     uno::Reference<beans::XPropertyContainer> xUDProps
2118                         = i_xDocProps->getUserDefinedProperties();
2119                     try {
2120                         xUDProps->addProperty(aName,
2121                             beans::PropertyAttribute::REMOVABLE,
2122                             uno::Any(aContent));
2123                         AddMetaUserDefined(aName);
2124                         bChanged = true;
2125                     } catch (uno::Exception &) {
2126                         // ignore
2127                     }
2128                 }
2129             }
2130             break;
2131         default:
2132             break;
2133     }
2134
2135     return bChanged;
2136 }
2137
2138 bool HTMLParser::ParseMetaOptions(
2139         const uno::Reference<document::XDocumentProperties> & i_xDocProps,
2140         SvKeyValueIterator *i_pHeader )
2141 {
2142     HtmlOptionId nContentOption = HtmlOptionId::CONTENT;
2143     rtl_TextEncoding eEnc = RTL_TEXTENCODING_DONTKNOW;
2144
2145     bool bRet = ParseMetaOptionsImpl( i_xDocProps, i_pHeader,
2146                       GetOptions(&nContentOption),
2147                       eEnc );
2148
2149     // If the encoding is set by a META tag, it may only overwrite the
2150     // current encoding if both, the current and the new encoding, are 1-sal_uInt8
2151     // encodings. Everything else cannot lead to reasonable results.
2152     if (RTL_TEXTENCODING_DONTKNOW != eEnc &&
2153         rtl_isOctetTextEncoding( eEnc ) &&
2154         rtl_isOctetTextEncoding( GetSrcEncoding() ) )
2155     {
2156         eEnc = GetExtendedCompatibilityTextEncoding( eEnc );
2157         SetSrcEncoding( eEnc );
2158     }
2159
2160     return bRet;
2161 }
2162
2163 rtl_TextEncoding HTMLParser::GetEncodingByMIME( const OUString& rMime )
2164 {
2165     OUString sType;
2166     OUString sSubType;
2167     INetContentTypeParameterList aParameters;
2168     if (INetContentTypes::parse(rMime, sType, sSubType, &aParameters))
2169     {
2170         auto const iter = aParameters.find("charset"_ostr);
2171         if (iter != aParameters.end())
2172         {
2173             const INetContentTypeParameter * pCharset = &iter->second;
2174             OString sValue(OUStringToOString(pCharset->m_sValue, RTL_TEXTENCODING_ASCII_US));
2175             return GetExtendedCompatibilityTextEncoding( rtl_getTextEncodingFromMimeCharset( sValue.getStr() ) );
2176         }
2177     }
2178     return RTL_TEXTENCODING_DONTKNOW;
2179 }
2180
2181 rtl_TextEncoding HTMLParser::GetEncodingByHttpHeader( SvKeyValueIterator *pHTTPHeader )
2182 {
2183     rtl_TextEncoding eRet = RTL_TEXTENCODING_DONTKNOW;
2184     if( pHTTPHeader )
2185     {
2186         SvKeyValue aKV;
2187         for( bool bCont = pHTTPHeader->GetFirst( aKV ); bCont;
2188              bCont = pHTTPHeader->GetNext( aKV ) )
2189         {
2190             if( aKV.GetKey().equalsIgnoreAsciiCase( OOO_STRING_SVTOOLS_HTML_META_content_type ) )
2191             {
2192                 if( !aKV.GetValue().isEmpty() )
2193                 {
2194                     eRet = HTMLParser::GetEncodingByMIME( aKV.GetValue() );
2195                 }
2196             }
2197         }
2198     }
2199     return eRet;
2200 }
2201
2202 bool HTMLParser::SetEncodingByHTTPHeader( SvKeyValueIterator *pHTTPHeader )
2203 {
2204     bool bRet = false;
2205     rtl_TextEncoding eEnc = HTMLParser::GetEncodingByHttpHeader( pHTTPHeader );
2206     if(RTL_TEXTENCODING_DONTKNOW != eEnc)
2207     {
2208         SetSrcEncoding( eEnc );
2209         bRet = true;
2210     }
2211     return bRet;
2212 }
2213
2214
2215 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */