Bump version to 5.0-14
[LibreOffice.git] / svtools / source / svhtml / parhtml.cxx
blob4ea4835aafb820f32a775f532ee8fc0ca6c6fcf9
1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 * This file incorporates work covered by the following license notice:
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
21 #include <ctype.h>
22 #include <comphelper/string.hxx>
23 #include <o3tl/ptr_container.hxx>
24 #include <tools/stream.hxx>
25 #include <tools/debug.hxx>
26 #include <tools/color.hxx>
27 #include <rtl/ustrbuf.hxx>
28 #include <rtl/strbuf.hxx>
30 #include <tools/tenccvt.hxx>
31 #include <tools/datetime.hxx>
32 #include <svl/inettype.hxx>
33 #include <com/sun/star/beans/PropertyAttribute.hpp>
34 #include <com/sun/star/document/XDocumentProperties.hpp>
36 #include <svtools/parhtml.hxx>
37 #include <svtools/htmltokn.h>
38 #include <svtools/htmlkywd.hxx>
40 #include <memory>
41 #include <utility>
43 using namespace ::com::sun::star;
// Initial capacity of the temporary scan buffers used by the tokenizer.
// A buffer whose length reaches exactly this value is flushed into aToken.
46 const sal_Int32 MAX_LEN( 1024L );
// Upper bound on the number of characters collected for a named character
// entity (the name following '&') before it is looked up.
48 const sal_Int32 MAX_ENTITY_LEN( 8L );
51 // Tables to convert option values into strings
53 // <INPUT TYPE=xxx>
// Keyword -> HTML_IT_* lookup table used by HTMLOption::GetInputType().
// HTMLOption::GetEnum() walks the table until it meets an entry whose name
// is null, so the final { 0, 0 } entry must stay last.
54 static HTMLOptionEnum const aInputTypeOptEnums[] =
56 { OOO_STRING_SVTOOLS_HTML_IT_text, HTML_IT_TEXT },
57 { OOO_STRING_SVTOOLS_HTML_IT_password, HTML_IT_PASSWORD },
58 { OOO_STRING_SVTOOLS_HTML_IT_checkbox, HTML_IT_CHECKBOX },
59 { OOO_STRING_SVTOOLS_HTML_IT_radio, HTML_IT_RADIO },
60 { OOO_STRING_SVTOOLS_HTML_IT_range, HTML_IT_RANGE },
61 { OOO_STRING_SVTOOLS_HTML_IT_scribble, HTML_IT_SCRIBBLE },
62 { OOO_STRING_SVTOOLS_HTML_IT_file, HTML_IT_FILE },
63 { OOO_STRING_SVTOOLS_HTML_IT_hidden, HTML_IT_HIDDEN },
64 { OOO_STRING_SVTOOLS_HTML_IT_submit, HTML_IT_SUBMIT },
65 { OOO_STRING_SVTOOLS_HTML_IT_image, HTML_IT_IMAGE },
66 { OOO_STRING_SVTOOLS_HTML_IT_reset, HTML_IT_RESET },
67 { OOO_STRING_SVTOOLS_HTML_IT_button, HTML_IT_BUTTON },
68 { 0, 0 } // end-of-table sentinel (null name)
71 // <TABLE FRAME=xxx>
// Keyword -> HTML_TF_* lookup table used by HTMLOption::GetTableFrame().
// Both the "box" and the "border" keyword map to HTML_TF_BOX.
// The final { 0, 0 } entry terminates the lookup in HTMLOption::GetEnum().
72 static HTMLOptionEnum const aTableFrameOptEnums[] =
74 { OOO_STRING_SVTOOLS_HTML_TF_void, HTML_TF_VOID },
75 { OOO_STRING_SVTOOLS_HTML_TF_above, HTML_TF_ABOVE },
76 { OOO_STRING_SVTOOLS_HTML_TF_below, HTML_TF_BELOW },
77 { OOO_STRING_SVTOOLS_HTML_TF_hsides, HTML_TF_HSIDES },
78 { OOO_STRING_SVTOOLS_HTML_TF_lhs, HTML_TF_LHS },
79 { OOO_STRING_SVTOOLS_HTML_TF_rhs, HTML_TF_RHS },
80 { OOO_STRING_SVTOOLS_HTML_TF_vsides, HTML_TF_VSIDES },
81 { OOO_STRING_SVTOOLS_HTML_TF_box, HTML_TF_BOX },
82 { OOO_STRING_SVTOOLS_HTML_TF_border, HTML_TF_BOX },
83 { 0, 0 } // end-of-table sentinel (null name)
86 // <TABLE RULES=xxx>
// Keyword -> HTML_TR_* lookup table used by HTMLOption::GetTableRules().
// The final { 0, 0 } entry terminates the lookup in HTMLOption::GetEnum().
87 static HTMLOptionEnum const aTableRulesOptEnums[] =
89 { OOO_STRING_SVTOOLS_HTML_TR_none, HTML_TR_NONE },
90 { OOO_STRING_SVTOOLS_HTML_TR_groups, HTML_TR_GROUPS },
91 { OOO_STRING_SVTOOLS_HTML_TR_rows, HTML_TR_ROWS },
92 { OOO_STRING_SVTOOLS_HTML_TR_cols, HTML_TR_COLS },
93 { OOO_STRING_SVTOOLS_HTML_TR_all, HTML_TR_ALL },
94 { 0, 0 } // end-of-table sentinel (null name)
97 sal_uInt16 HTMLOption::GetEnum( const HTMLOptionEnum *pOptEnums, sal_uInt16 nDflt ) const
99 sal_uInt16 nValue = nDflt;
101 while( pOptEnums->pName )
102 if( aValue.equalsIgnoreAsciiCaseAscii( pOptEnums->pName ) )
103 break;
104 else
105 pOptEnums++;
107 if( pOptEnums->pName )
108 nValue = pOptEnums->nValue;
110 return nValue;
113 bool HTMLOption::GetEnum( sal_uInt16 &rEnum, const HTMLOptionEnum *pOptEnums ) const
115 while( pOptEnums->pName )
117 if( aValue.equalsIgnoreAsciiCaseAscii( pOptEnums->pName ) )
118 break;
119 else
120 pOptEnums++;
123 const sal_Char *pName = pOptEnums->pName;
124 if( pName )
125 rEnum = pOptEnums->nValue;
127 return (pName != 0);
130 HTMLOption::HTMLOption( sal_uInt16 nTok, const OUString& rToken,
131 const OUString& rValue )
132 : aValue(rValue)
133 , aToken(rToken)
134 , nToken( nTok )
136 DBG_ASSERT( nToken>=HTML_OPTION_START && nToken<HTML_OPTION_END,
137 "HTMLOption: unknown token" );
140 sal_uInt32 HTMLOption::GetNumber() const
142 DBG_ASSERT( (nToken>=HTML_OPTION_NUMBER_START &&
143 nToken<HTML_OPTION_NUMBER_END) ||
144 (nToken>=HTML_OPTION_CONTEXT_START &&
145 nToken<HTML_OPTION_CONTEXT_END) ||
146 nToken==HTML_O_VALUE,
147 "GetNumber: Option not numerical" );
148 OUString aTmp(comphelper::string::stripStart(aValue, ' '));
149 sal_Int32 nTmp = aTmp.toInt32();
150 return nTmp >= 0 ? (sal_uInt32)nTmp : 0;
153 sal_Int32 HTMLOption::GetSNumber() const
155 DBG_ASSERT( (nToken>=HTML_OPTION_NUMBER_START && nToken<HTML_OPTION_NUMBER_END) ||
156 (nToken>=HTML_OPTION_CONTEXT_START && nToken<HTML_OPTION_CONTEXT_END),
157 "GetSNumber: Option not numerical" );
158 OUString aTmp(comphelper::string::stripStart(aValue, ' '));
159 return aTmp.toInt32();
162 void HTMLOption::GetNumbers( std::vector<sal_uInt32> &rNumbers, bool bSpaceDelim ) const
164 rNumbers.clear();
166 if( bSpaceDelim )
168 // This is a very simplified scanner: it only searches all
169 // numerals in the string.
170 bool bInNum = false;
171 sal_uLong nNum = 0;
172 for( sal_Int32 i=0; i<aValue.getLength(); i++ )
174 sal_Unicode c = aValue[ i ];
175 if( c>='0' && c<='9' )
177 nNum *= 10;
178 nNum += (c - '0');
179 bInNum = true;
181 else if( bInNum )
183 rNumbers.push_back( nNum );
184 bInNum = false;
185 nNum = 0;
188 if( bInNum )
190 rNumbers.push_back( nNum );
193 else
195 // Check whether numbers are separated by ',' and
196 // insert 0 if necessary
197 sal_Int32 nPos = 0;
198 while( nPos < aValue.getLength() )
200 sal_Unicode c;
201 while( nPos < aValue.getLength() &&
202 ((c=aValue[nPos]) == ' ' || c == '\t' ||
203 c == '\n' || c== '\r' ) )
204 nPos++;
206 if( nPos==aValue.getLength() )
207 rNumbers.push_back(0);
208 else
210 sal_Int32 nEnd = aValue.indexOf( (sal_Unicode)',', nPos );
211 if( -1 == nEnd )
213 sal_Int32 nTmp = aValue.copy(nPos).toInt32();
214 rNumbers.push_back( nTmp >= 0 ? (sal_uInt32)nTmp : 0 );
215 nPos = aValue.getLength();
217 else
219 sal_Int32 nTmp = aValue.copy(nPos,nEnd-nPos).toInt32();
220 rNumbers.push_back( nTmp >= 0 ? (sal_uInt32)nTmp : 0 );
221 nPos = nEnd+1;
228 void HTMLOption::GetColor( Color& rColor ) const
230 DBG_ASSERT( (nToken>=HTML_OPTION_COLOR_START && nToken<HTML_OPTION_COLOR_END) || nToken==HTML_O_SIZE,
231 "GetColor: Option is not a color." );
233 OUString aTmp(aValue.toAsciiLowerCase());
234 sal_uInt32 nColor = SAL_MAX_UINT32;
235 if (!aTmp.isEmpty() && aTmp[0] != '#')
236 nColor = GetHTMLColor(aTmp);
238 if( SAL_MAX_UINT32 == nColor )
240 nColor = 0;
241 sal_Int32 nPos = 0;
242 for (sal_uInt32 i=0; i<6; ++i)
244 // Whatever Netscape does to get color values,
245 // at maximum three characters < '0' are ignored.
246 sal_Unicode c = nPos<aTmp.getLength() ? aTmp[ nPos++ ] : '0';
247 if( c < '0' )
249 c = nPos<aTmp.getLength() ? aTmp[nPos++] : '0';
250 if( c < '0' )
251 c = nPos<aTmp.getLength() ? aTmp[nPos++] : '0';
253 nColor *= 16;
254 if( c >= '0' && c <= '9' )
255 nColor += (c - '0');
256 else if( c >= 'a' && c <= 'f' )
257 nColor += (c + 0xa - 'a');
261 rColor.SetRed( (sal_uInt8)((nColor & 0x00ff0000) >> 16) );
262 rColor.SetGreen( (sal_uInt8)((nColor & 0x0000ff00) >> 8));
263 rColor.SetBlue( (sal_uInt8)(nColor & 0x000000ff) );
266 HTMLInputType HTMLOption::GetInputType() const
268 DBG_ASSERT( nToken==HTML_O_TYPE, "GetInputType: Option not TYPE" );
269 return (HTMLInputType)GetEnum( aInputTypeOptEnums, HTML_IT_TEXT );
272 HTMLTableFrame HTMLOption::GetTableFrame() const
274 DBG_ASSERT( nToken==HTML_O_FRAME, "GetTableFrame: Option not FRAME" );
275 return (HTMLTableFrame)GetEnum( aTableFrameOptEnums, HTML_TF_VOID );
278 HTMLTableRules HTMLOption::GetTableRules() const
280 DBG_ASSERT( nToken==HTML_O_RULES, "GetTableRules: Option not RULES" );
281 return (HTMLTableRules)GetEnum( aTableRulesOptEnums, HTML_TR_NONE );
284 HTMLParser::HTMLParser( SvStream& rIn, bool bReadNewDoc ) :
285 SvParser( rIn ),
286 bNewDoc(bReadNewDoc),
287 bIsInHeader(true),
288 bIsInBody(false),
289 bReadListing(false),
290 bReadXMP(false),
291 bReadPRE(false),
292 bReadTextArea(false),
293 bReadScript(false),
294 bReadStyle(false),
295 bEndTokenFound(false),
296 bPre_IgnoreNewPara(false),
297 bReadNextChar(false),
298 bReadComment(false),
299 nPre_LinePos(0),
300 mnPendingOffToken(0)
302 //#i76649, default to UTF-8 for HTML unless we know differently
303 SetSrcEncoding(RTL_TEXTENCODING_UTF8);
306 HTMLParser::~HTMLParser()
310 SvParserState HTMLParser::CallParser()
312 eState = SVPAR_WORKING;
313 nNextCh = GetNextChar();
314 SaveState( 0 );
316 nPre_LinePos = 0;
317 bPre_IgnoreNewPara = false;
319 AddFirstRef();
320 Continue( 0 );
321 if( SVPAR_PENDING != eState )
322 ReleaseRef(); // Parser not needed anymore
324 return eState;
327 void HTMLParser::Continue( int nToken )
329 if( !nToken )
330 nToken = GetNextToken();
332 while( IsParserWorking() )
334 SaveState( nToken );
335 nToken = FilterToken( nToken );
337 if( nToken )
338 NextToken( nToken );
340 if( IsParserWorking() )
341 SaveState( 0 ); // continue with new token
343 nToken = GetNextToken();
347 int HTMLParser::FilterToken( int nToken )
349 switch( nToken )
351 case sal_Unicode(EOF):
352 nToken = 0;
353 break; // don't pass
355 case HTML_HEAD_OFF:
356 bIsInBody = true;
357 bIsInHeader = false;
358 break;
360 case HTML_HEAD_ON:
361 bIsInHeader = true;
362 break;
364 case HTML_BODY_ON:
365 bIsInHeader = false;
366 bIsInBody = true;
367 break;
369 case HTML_FRAMESET_ON:
370 bIsInHeader = false;
371 bIsInBody = false;
372 break;
374 case HTML_BODY_OFF:
375 bIsInBody = bReadPRE = bReadListing = bReadXMP = false;
376 break;
378 case HTML_HTML_OFF:
379 nToken = 0;
380 bReadPRE = bReadListing = bReadXMP = false;
381 break; // HTML_ON hasn't been passed either !
383 case HTML_PREFORMTXT_ON:
384 StartPRE();
385 break;
387 case HTML_PREFORMTXT_OFF:
388 FinishPRE();
389 break;
391 case HTML_LISTING_ON:
392 StartListing();
393 break;
395 case HTML_LISTING_OFF:
396 FinishListing();
397 break;
399 case HTML_XMP_ON:
400 StartXMP();
401 break;
403 case HTML_XMP_OFF:
404 FinishXMP();
405 break;
407 default:
408 if( bReadPRE )
409 nToken = FilterPRE( nToken );
410 else if( bReadListing )
411 nToken = FilterListing( nToken );
412 else if( bReadXMP )
413 nToken = FilterXMP( nToken );
415 break;
418 return nToken;
// Character classification helpers used by the tokenizer.
// The argument is fully parenthesized in the hand-written macros so that an
// expression argument (e.g. a conditional expression) is classified as a
// whole. HTML_ISSPACE and HTML_ISPRINTABLE evaluate their argument more
// than once, so do not pass expressions with side effects.
#define HTML_ISDIGIT( c ) comphelper::string::isdigitAscii(c)
#define HTML_ISALPHA( c ) comphelper::string::isalphaAscii(c)
#define HTML_ISALNUM( c ) comphelper::string::isalnumAscii(c)
// blank or one of the control characters 0x09 .. 0x0d
#define HTML_ISSPACE( c ) ( ' ' == (c) || ((c) >= 0x09 && (c) <= 0x0d) )
// everything from 32 upwards except 127
#define HTML_ISPRINTABLE( c ) ( (c) >= 32 && (c) != 127 )
#define HTML_ISHEXDIGIT( c ) comphelper::string::isxdigitAscii(c)
// Scans characters starting at nNextCh and appends the resulting text to
// aToken (via a temporary buffer that is flushed whenever it reaches
// MAX_LEN characters).
//
// cBreak  character that ends the scan when it is met outside quotes.
//         The value '>' selects "tag content" mode: quotes and '=' are
//         tracked, backslashes and certain entity results are escaped with
//         a backslash, and white space is kept instead of being collapsed.
//
// Returns HTML_TEXTTOKEN on most paths, HTML_NONBREAKSPACE or HTML_SOFTHYPH
// when an entity resolves to the internal codes 1 or 2 outside of a tag and
// no text precedes it, and 0 when only blanks were read before end of file.
428 int HTMLParser::ScanText( const sal_Unicode cBreak )
430 OUStringBuffer sTmpBuffer( MAX_LEN );
431 bool bContinue = true;
432 bool bEqSignFound = false;
433 sal_Unicode cQuote = 0U;
435 while( bContinue && IsParserWorking() )
437 bool bNextCh = true;
438 switch( nNextCh )
// Character reference: numeric (&#...;) or named (&name;).
440 case '&':
441 bEqSignFound = false;
// In XMP mode the ampersand is taken literally.
442 if( bReadXMP )
443 sTmpBuffer.append( '&' );
444 else
// Remember where we are so the stream can be rewound if the reference
// turns out not to be one.
446 sal_uLong nStreamPos = rInput.Tell();
447 sal_uLong nLinePos = GetLinePos();
449 sal_Unicode cChar = 0U;
450 if( '#' == (nNextCh = GetNextChar()) )
452 nNextCh = GetNextChar();
// NOTE(review): only a lower-case 'x' introduces a hexadecimal reference here.
453 const bool bIsHex( 'x' == nNextCh );
454 const bool bIsDecOrHex( bIsHex || HTML_ISDIGIT(nNextCh) );
455 if ( bIsDecOrHex )
457 if ( bIsHex )
459 nNextCh = GetNextChar();
// Accumulate hexadecimal digits into cChar.
// NOTE(review): the value is accumulated in a sal_Unicode; presumably
// references beyond its range wrap around - confirm.
460 while ( HTML_ISHEXDIGIT(nNextCh) )
462 cChar = cChar * 16U +
463 ( nNextCh <= '9'
464 ? sal_Unicode( nNextCh - '0' )
465 : ( nNextCh <= 'F'
466 ? sal_Unicode( nNextCh - 'A' + 10 )
467 : sal_Unicode( nNextCh - 'a' + 10 ) ) );
468 nNextCh = GetNextChar();
471 else
// Accumulate decimal digits into cChar.
475 cChar = cChar * 10U + sal_Unicode( nNextCh - '0');
476 nNextCh = GetNextChar();
478 while( HTML_ISDIGIT(nNextCh) );
// For values below 256 and a source encoding other than "don't know",
// UCS2 or UTF-8, the value is treated as a single byte in the source
// encoding and converted to Unicode.
481 if( RTL_TEXTENCODING_DONTKNOW != eSrcEnc &&
482 RTL_TEXTENCODING_UCS2 != eSrcEnc &&
483 RTL_TEXTENCODING_UTF8 != eSrcEnc &&
484 cChar < 256 )
486 const sal_uInt32 convertFlags =
487 RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_DEFAULT |
488 RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_DEFAULT |
489 RTL_TEXTTOUNICODE_FLAGS_INVALID_DEFAULT;
491 sal_Char cEncodedChar = static_cast<sal_Char>(cChar);
492 cChar = OUString(&cEncodedChar, 1, eSrcEnc, convertFlags).toChar();
493 if( 0U == cChar )
495 // If the character could not be
496 // converted, because a conversion is not
497 // available, do no conversion at all.
498 cChar = cEncodedChar;
502 else
503 nNextCh = 0U;
505 else if( HTML_ISALPHA( nNextCh ) )
// Named entity: collect up to MAX_ENTITY_LEN alphanumeric characters.
507 OUStringBuffer sEntityBuffer( MAX_ENTITY_LEN );
508 sal_Int32 nPos = 0L;
511 sEntityBuffer.append( nNextCh );
512 nPos++;
513 nNextCh = GetNextChar();
515 while( nPos < MAX_ENTITY_LEN && HTML_ISALNUM( nNextCh ) &&
516 !rInput.IsEof() );
518 if( IsParserWorking() && !rInput.IsEof() )
520 OUString sEntity(sEntityBuffer.getStr(), nPos);
521 cChar = GetHTMLCharName( sEntity );
523 // not found ( == 0 ): plain text
524 // or a character which is inserted as attribute
525 if( 0U == cChar && ';' != nNextCh )
527 DBG_ASSERT( rInput.Tell() - nStreamPos ==
528 (sal_uLong)(nPos+1L)*GetCharSize(),
529 "UTF-8 is failing here" );
// Try progressively shorter prefixes of the collected name; on a hit
// the stream is moved back by the characters that were cut off.
530 for( sal_Int32 i = nPos-1; i>1; i-- )
532 nNextCh = sEntityBuffer[i];
533 sEntityBuffer.setLength( i );
534 sEntity = OUString(sEntityBuffer.getStr(), i);
535 cChar = GetHTMLCharName( sEntity );
536 if( cChar )
538 rInput.SeekRel( -(long)
539 ((nPos-i)*GetCharSize()) );
540 nlLinePos -= sal_uInt32(nPos-i);
541 nPos = i;
542 ClearTxtConvContext();
543 break;
548 if( !cChar ) // unknown character?
550 // back in stream, insert '&'
551 // and restart with next character
552 sTmpBuffer.append( '&' );
554 DBG_ASSERT( rInput.Tell()-nStreamPos ==
555 (sal_uLong)(nPos+1)*GetCharSize(),
556 "Wrong stream position" );
557 DBG_ASSERT( nlLinePos-nLinePos ==
558 (sal_uLong)(nPos+1),
559 "Wrong line position" );
560 rInput.Seek( nStreamPos );
561 nlLinePos = nLinePos;
562 ClearTxtConvContext();
563 break;
566 assert(cChar != 0);
568 // 1 == Non Breaking Space
569 // 2 == SoftHyphen
571 if (cChar == 1 || cChar == 2)
573 if( '>' == cBreak )
575 // When reading the content of a tag we have
576 // to change it to ' ' or '-'
577 if( 1U == cChar )
578 cChar = ' ';
579 else //2U
580 cChar = '-';
582 else
584 // If not scanning a tag return token
585 aToken += sTmpBuffer.makeStringAndClear();
// Text was collected before the entity: return that text first and
// rewind so the entity is scanned again on the next call.
587 if( !aToken.isEmpty() )
589 // restart with character
590 nNextCh = '&';
591 DBG_ASSERT( rInput.Tell()-nStreamPos ==
592 (sal_uLong)(nPos+1)*GetCharSize(),
593 "Wrong stream position" );
594 DBG_ASSERT( nlLinePos-nLinePos ==
595 (sal_uLong)(nPos+1),
596 "Wrong line position" );
597 rInput.Seek( nStreamPos );
598 nlLinePos = nLinePos;
599 ClearTxtConvContext();
600 return HTML_TEXTTOKEN;
603 // Hack: _GetNextChar shall not read the
604 // next character
605 if( ';' != nNextCh )
606 aToken += " ";
607 if( 1U == cChar )
608 return HTML_NONBREAKSPACE;
609 else //2U
610 return HTML_SOFTHYPH;
614 else
615 nNextCh = 0U;
617 // &{...};-JavaScript-Macros are not supported any longer.
618 else if( IsParserWorking() )
620 sTmpBuffer.append( '&' );
621 bNextCh = false;
622 break;
// Only a terminating ';' is consumed; any other character is scanned
// again by the next loop iteration.
625 bNextCh = (';' == nNextCh);
626 if( cBreak=='>' && (cChar=='\\' || cChar=='\'' ||
627 cChar=='\"' || cChar==' ') )
629 // ' and " have to be escaped within tags to separate
630 // them from ' and " enclosing options.
631 // \ has to be escaped as well.
632 // Space is protected because it's not a delimiter between
633 // options.
634 sTmpBuffer.append( '\\' );
635 if( MAX_LEN == sTmpBuffer.getLength() )
636 aToken += sTmpBuffer.makeStringAndClear();
638 if( IsParserWorking() )
640 if( cChar )
641 sTmpBuffer.append( cChar );
643 else if( SVPAR_PENDING==eState && '>'!=cBreak )
645 // Restart with '&', the remainder is returned as
646 // text token.
647 if( !aToken.isEmpty() || !sTmpBuffer.isEmpty() )
649 // _GetNextChar() returns the previous text and
650 // during the next execution a new character is read.
651 // Thus we have to position in front of the '&'.
652 nNextCh = 0U;
653 rInput.Seek( nStreamPos-(sal_uInt32)GetCharSize() );
654 nlLinePos = nLinePos-1;
655 ClearTxtConvContext();
656 bReadNextChar = true;
658 bNextCh = false;
661 break;
// Inside a tag an unquoted '=' announces that an option value follows.
662 case '=':
663 if( '>'==cBreak && !cQuote )
664 bEqSignFound = true;
665 sTmpBuffer.append( nNextCh );
666 break;
668 case '\\':
669 if( '>'==cBreak )
671 // Mark (escape) it within tags
672 sTmpBuffer.append( '\\' );
673 if( MAX_LEN == sTmpBuffer.getLength() )
674 aToken += sTmpBuffer.makeStringAndClear();
676 sTmpBuffer.append( '\\' );
677 break;
// Inside a tag a quote directly after '=' opens a quoted value; the same
// quote character closes it again.
679 case '\"':
680 case '\'':
681 if( '>'==cBreak )
683 if( bEqSignFound )
684 cQuote = nNextCh;
685 else if( cQuote && (cQuote==nNextCh ) )
686 cQuote = 0U;
688 sTmpBuffer.append( nNextCh );
689 bEqSignFound = false;
690 break;
// The EOF character value only ends the scan if the stream really is at
// its end; otherwise it is ordinary text.
692 case sal_Unicode(EOF):
693 if( rInput.IsEof() )
695 bContinue = false;
697 else
699 sTmpBuffer.append( nNextCh );
701 break;
703 case '<':
704 bEqSignFound = false;
705 if( '>'==cBreak )
706 sTmpBuffer.append( nNextCh );
707 else
708 bContinue = false; // break, the string is complete
709 break;
711 case '\f':
712 if( '>' == cBreak )
714 // If scanning options treat it like a space, ...
715 sTmpBuffer.append( ' ' );
717 else
719 // otherwise it's a separate token.
720 bContinue = false;
722 break;
724 case '\r':
725 case '\n':
726 if( '>'==cBreak )
728 // cr/lf in tag is handled in _GetNextToken()
729 sTmpBuffer.append( nNextCh );
730 break;
732 else if( bReadListing || bReadXMP || bReadPRE || bReadTextArea )
734 bContinue = false;
735 break;
737 // Reduce sequence of CR/LF/BLANK/TAB to a single blank
738 // no break!!
739 case '\t':
740 if( '\t'==nNextCh && bReadPRE && '>'!=cBreak )
742 // Pass Tabs up in <PRE>
743 bContinue = false;
744 break;
746 // no break
747 case '\x0b':
748 if( '\x0b'==nNextCh && (bReadPRE || bReadXMP ||bReadListing) &&
749 '>'!=cBreak )
751 break;
753 nNextCh = ' ';
754 // no break;
755 case ' ':
756 sTmpBuffer.append( nNextCh );
757 if( '>'!=cBreak && (!bReadListing && !bReadXMP &&
758 !bReadPRE && !bReadTextArea) )
760 // Reduce sequences of Blanks/Tabs/CR/LF to a single blank
761 do {
762 if( sal_Unicode(EOF) == (nNextCh = GetNextChar()) &&
763 rInput.IsEof() )
765 if( !aToken.isEmpty() || sTmpBuffer.getLength() > 1L )
767 // Have seen s.th. aside from blanks?
768 aToken += sTmpBuffer.makeStringAndClear();
769 return HTML_TEXTTOKEN;
771 else
772 // Only read blanks: no text must be returned
773 // and _GetNextToken has to read until EOF
774 return 0;
776 } while ( ' ' == nNextCh || '\t' == nNextCh ||
777 '\r' == nNextCh || '\n' == nNextCh ||
778 '\x0b' == nNextCh );
779 bNextCh = false;
781 break;
783 default:
784 bEqSignFound = false;
// The break character ends the scan unless we are inside a quoted value.
785 if (nNextCh == cBreak && !cQuote)
786 bContinue = false;
787 else
789 do {
790 // All remaining characters make their way into the text.
791 sTmpBuffer.append( nNextCh );
792 if( MAX_LEN == sTmpBuffer.getLength() )
794 aToken += sTmpBuffer.makeStringAndClear();
796 if( ( sal_Unicode(EOF) == (nNextCh = GetNextChar()) &&
797 rInput.IsEof() ) ||
798 !IsParserWorking() )
800 if( !sTmpBuffer.isEmpty() )
801 aToken += sTmpBuffer.makeStringAndClear();
802 return HTML_TEXTTOKEN;
804 } while( HTML_ISALPHA( nNextCh ) || HTML_ISDIGIT( nNextCh ) );
805 bNextCh = false;
// Flush a full buffer, then fetch the next character unless a case above
// already did so.
809 if( MAX_LEN == sTmpBuffer.getLength() )
810 aToken += sTmpBuffer.makeStringAndClear();
812 if( bContinue && bNextCh )
813 nNextCh = GetNextChar();
816 if( !sTmpBuffer.isEmpty() )
817 aToken += sTmpBuffer.makeStringAndClear();
819 return HTML_TEXTTOKEN;
// Reads raw (unparsed) data while bReadScript, bReadStyle or a non-empty
// aEndToken is active. Text is collected into aToken up to the end of the
// current line, the end of file or the terminating tag.
//
// Returns HTML_RAWDATA by default. Returns 0 when the end token had already
// been found by the previous call, when the end token is met before any
// data was collected, when end of file is reached without data, or when the
// parser is no longer in its working state.
822 int HTMLParser::_GetNextRawToken()
824 OUStringBuffer sTmpBuffer( MAX_LEN );
826 if( bEndTokenFound )
828 // During the last execution we already found the end token,
829 // thus we don't have to search it again.
830 bReadScript = false;
831 bReadStyle = false;
832 aEndToken.clear();
833 bEndTokenFound = false;
835 return 0;
838 // Default return value: HTML_RAWDATA
839 bool bContinue = true;
840 int nToken = HTML_RAWDATA;
841 SaveState( 0 );
842 while( bContinue && IsParserWorking() )
844 bool bNextCh = true;
845 switch( nNextCh )
847 case '<':
849 // Maybe we've reached the end.
851 // Save what we have read previously...
852 aToken += sTmpBuffer.makeStringAndClear();
854 // and remember position in stream.
855 sal_uLong nStreamPos = rInput.Tell();
856 sal_uLong nLineNr = GetLineNr();
857 sal_uLong nLinePos = GetLinePos();
859 // Start of an end token?
860 bool bOffState = false;
861 if( '/' == (nNextCh = GetNextChar()) )
863 bOffState = true;
864 nNextCh = GetNextChar();
866 else if( '!' == nNextCh )
868 sTmpBuffer.append( nNextCh );
869 nNextCh = GetNextChar();
872 // Read following letters
873 while( (HTML_ISALPHA(nNextCh) || '-'==nNextCh) &&
874 IsParserWorking() && sTmpBuffer.getLength() < MAX_LEN )
876 sTmpBuffer.append( nNextCh );
877 nNextCh = GetNextChar();
// The tag name is compared in lower case.
880 OUString aTok( sTmpBuffer.toString() );
881 aTok = aTok.toAsciiLowerCase();
882 bool bDone = false;
883 if( bReadScript || !aEndToken.isEmpty() )
// Inside a comment no end token is recognized.
885 if( !bReadComment )
887 if( aTok.startsWith( OOO_STRING_SVTOOLS_HTML_comment ) )
889 bReadComment = true;
891 else
893 // A script has to end with "</SCRIPT>". But
894 // ">" is optional for security reasons
895 bDone = bOffState &&
896 ( bReadScript
897 ? aTok == OOO_STRING_SVTOOLS_HTML_script
898 : aTok.equals(aEndToken) );
901 if( bReadComment && '>'==nNextCh && aTok.endsWith( "--" ) )
903 // End of comment of style <!----->
904 bReadComment = false;
907 else
909 // Style sheets can be closed by </STYLE>, </HEAD> or <BODY>
910 if( bOffState )
911 bDone = aTok == OOO_STRING_SVTOOLS_HTML_style ||
912 aTok == OOO_STRING_SVTOOLS_HTML_head;
913 else
914 bDone = aTok == OOO_STRING_SVTOOLS_HTML_body;
917 if( bDone )
919 // Done! Return the previously read string (if requested)
920 // and continue.
922 bContinue = false;
924 // nToken==0 means, _GetNextToken continues to read
925 if( aToken.isEmpty() && (bReadStyle || bReadScript) )
927 // Immediately close environment (or context?)
928 // and parse the end token
929 bReadScript = false;
930 bReadStyle = false;
931 aEndToken.clear();
932 nToken = 0;
934 else
936 // Keep bReadScript/bReadStyle alive
937 // and parse end token during next execution
938 bEndTokenFound = true;
941 // Move backwards in stream to '<'
942 rInput.Seek( nStreamPos );
943 SetLineNr( nLineNr );
944 SetLinePos( nLinePos );
945 ClearTxtConvContext();
946 nNextCh = '<';
948 // Don't append string to token.
949 sTmpBuffer.setLength( 0L );
951 else
953 // remember "</" , everything else we find in the buffer
954 aToken += "<";
955 if( bOffState )
956 aToken += "/";
// nNextCh already holds the character after the tag name.
958 bNextCh = false;
961 break;
962 case '-':
963 sTmpBuffer.append( nNextCh );
// Within a comment, a run of at least two '-' followed by '>' ends it.
964 if( bReadComment )
966 bool bTwoMinus = false;
967 nNextCh = GetNextChar();
968 while( '-' == nNextCh && IsParserWorking() )
970 bTwoMinus = true;
972 if( MAX_LEN == sTmpBuffer.getLength() )
973 aToken += sTmpBuffer.makeStringAndClear();
974 sTmpBuffer.append( nNextCh );
975 nNextCh = GetNextChar();
978 if( '>' == nNextCh && IsParserWorking() && bTwoMinus )
979 bReadComment = false;
981 bNextCh = false;
983 break;
985 case '\r':
986 // \r\n? closes the current text token (even if it's empty)
987 nNextCh = GetNextChar();
988 if( nNextCh=='\n' )
989 nNextCh = GetNextChar();
990 bContinue = false;
991 break;
992 case '\n':
993 // \n closes the current text token (even if it's empty)
994 nNextCh = GetNextChar();
995 bContinue = false;
996 break;
997 case sal_Unicode(EOF):
998 // eof closes the current text token and behaves like having read
999 // an end token
1000 if( rInput.IsEof() )
1002 bContinue = false;
1003 if( !aToken.isEmpty() || !sTmpBuffer.isEmpty() )
1005 bEndTokenFound = true;
1007 else
1009 bReadScript = false;
1010 bReadStyle = false;
1011 aEndToken.clear();
1012 nToken = 0;
1014 break;
1016 // no break
1017 default:
1018 // all remaining characters are appended to the buffer
1019 sTmpBuffer.append( nNextCh );
1020 break;
// Flush the buffer when the token is complete or the buffer is full.
1023 if( (!bContinue && !sTmpBuffer.isEmpty()) ||
1024 MAX_LEN == sTmpBuffer.getLength() )
1025 aToken += sTmpBuffer.makeStringAndClear();
1027 if( bContinue && bNextCh )
1028 nNextCh = GetNextChar();
1031 if( IsParserWorking() )
1032 SaveState( 0 );
1033 else
1034 nToken = 0;
1036 return nToken;
1039 // Scan next token
// Returns the id of the next token and leaves its text in aToken (for tags:
// the text following the tag name, from which the options are parsed).
// Special results visible in this function:
//  - a pending HTML_<TOKEN>_OFF generated for a self-closing "<tag/>" is
//    returned first, with an empty aToken;
//  - 0 if the parser is not in its working state;
//  - -1 if the parser state is SVPAR_PENDING at the end of the scan;
//  - the EOF character value once the end of the stream is reached.
1040 int HTMLParser::_GetNextToken()
1042 int nRet = 0;
1043 sSaveToken.clear();
1045 if (mnPendingOffToken)
1047 // HTML_<TOKEN>_OFF generated for HTML_<TOKEN>_ON
1048 nRet = mnPendingOffToken;
1049 mnPendingOffToken = 0;
1050 aToken.clear();
1051 return nRet;
1054 // Delete options
1055 if (!maOptions.empty())
1056 maOptions.clear();
1058 if( !IsParserWorking() ) // Don't continue if already an error occurred
1059 return 0;
// Remember the flag so it can be restored if the scan ends pending.
1061 bool bReadNextCharSave = bReadNextChar;
1062 if( bReadNextChar )
1064 DBG_ASSERT( !bEndTokenFound,
1065 "Read a character despite </SCRIPT> was read?" );
1066 nNextCh = GetNextChar();
1067 if( !IsParserWorking() ) // Don't continue if already an error occurred
1068 return 0;
1069 bReadNextChar = false;
// Script, style and aEndToken-delimited content is read as raw data.
1072 if( bReadScript || bReadStyle || !aEndToken.isEmpty() )
1074 nRet = _GetNextRawToken();
1075 if( nRet || !IsParserWorking() )
1076 return nRet;
1079 do {
1080 bool bNextCh = true;
1081 switch( nNextCh )
1083 case '<':
// Remember the position of the '<' so it can be returned as plain text
// if no complete tag follows.
1085 sal_uLong nStreamPos = rInput.Tell();
1086 sal_uLong nLineNr = GetLineNr();
1087 sal_uLong nLinePos = GetLinePos();
1089 bool bOffState = false;
1090 if( '/' == (nNextCh = GetNextChar()) )
1092 bOffState = true;
1093 nNextCh = GetNextChar();
1095 if( HTML_ISALPHA( nNextCh ) || '!'==nNextCh )
// Collect the tag name: everything up to '>', '/' or white space.
1097 OUStringBuffer sTmpBuffer;
1098 do {
1099 sTmpBuffer.append( nNextCh );
1100 if( MAX_LEN == sTmpBuffer.getLength() )
1101 aToken += sTmpBuffer.makeStringAndClear();
1102 nNextCh = GetNextChar();
1103 } while( '>' != nNextCh && '/' != nNextCh && !HTML_ISSPACE( nNextCh ) &&
1104 IsParserWorking() && !rInput.IsEof() );
1106 if( !sTmpBuffer.isEmpty() )
1107 aToken += sTmpBuffer.makeStringAndClear();
1109 // Skip blanks
1110 while( HTML_ISSPACE( nNextCh ) && IsParserWorking() )
1111 nNextCh = GetNextChar();
1113 if( !IsParserWorking() )
1115 if( SVPAR_PENDING == eState )
1116 bReadNextChar = bReadNextCharSave;
1117 break;
1120 // Search token in table:
1121 sSaveToken = aToken;
1122 aToken = aToken.toAsciiLowerCase();
1123 if( 0 == (nRet = GetHTMLToken( aToken )) )
1124 // Unknown control
1125 nRet = HTML_UNKNOWNCONTROL_ON;
1127 // If it's a token which can be switched off...
1128 if( bOffState )
1130 if( HTML_TOKEN_ONOFF & nRet )
1132 // and there is an off token, return off token instead
1133 ++nRet;
1135 else if( HTML_LINEBREAK!=nRet )
1137 // and there is no off token, return unknown token.
1138 // (except for </BR>, that is treated like <BR>)
1139 nRet = HTML_UNKNOWNCONTROL_OFF;
1143 if( nRet == HTML_COMMENT )
1145 // fix: due to being case sensitive use sSaveToken as start of comment
1146 // and append a blank.
1147 aToken = sSaveToken;
1148 if( '>'!=nNextCh )
1149 aToken += " ";
// State recorded at the first '>' met inside the comment, used as the
// fallback end if no closing "-->" is found.
1150 sal_uLong nCStreamPos = 0;
1151 sal_uLong nCLineNr = 0;
1152 sal_uLong nCLinePos = 0;
1153 sal_Int32 nCStrLen = 0;
1155 bool bDone = false;
1156 // Read until closing -->. If not found restart at first >
1157 while( !bDone && !rInput.IsEof() && IsParserWorking() )
1159 if( '>'==nNextCh )
1161 if( !nCStreamPos )
1163 nCStreamPos = rInput.Tell();
1164 nCStrLen = aToken.getLength();
1165 nCLineNr = GetLineNr();
1166 nCLinePos = GetLinePos();
1168 bDone = aToken.endsWith( "--" );
1169 if( !bDone )
1170 aToken += OUString(nNextCh);
1172 else
1173 aToken += OUString(nNextCh);
1174 if( !bDone )
1175 nNextCh = GetNextChar();
1177 if( !bDone && IsParserWorking() && nCStreamPos )
1179 rInput.Seek( nCStreamPos );
1180 SetLineNr( nCLineNr );
1181 SetLinePos( nCLinePos );
1182 ClearTxtConvContext();
1183 aToken = aToken.copy(0, nCStrLen);
1184 nNextCh = '>';
1187 else
1189 // TokenString not needed anymore
1190 aToken.clear();
1193 // Read until closing '>'
1194 if( '>' != nNextCh && IsParserWorking() )
1196 ScanText( '>' );
1198 // fdo#34666 fdo#36080 fdo#36390: closing "/>"?:
1199 // generate pending HTML_<TOKEN>_OFF for HTML_<TOKEN>_ON
1200 // Do not convert this to a single HTML_<TOKEN>_OFF
1201 // which lead to fdo#56772.
1202 if ((HTML_TOKEN_ONOFF & nRet) && aToken.endsWith("/"))
1204 mnPendingOffToken = nRet + 1; // HTML_<TOKEN>_ON -> HTML_<TOKEN>_OFF
1205 aToken = aToken.replaceAt( aToken.getLength()-1, 1, ""); // remove trailing '/'
1207 if( sal_Unicode(EOF) == nNextCh && rInput.IsEof() )
1209 // Move back in front of < and restart there.
1210 // Return < as text.
1211 rInput.Seek( nStreamPos );
1212 SetLineNr( nLineNr );
1213 SetLinePos( nLinePos );
1214 ClearTxtConvContext();
1216 aToken = "<";
1217 nRet = HTML_TEXTTOKEN;
1218 nNextCh = GetNextChar();
1219 bNextCh = false;
1220 break;
1223 if( SVPAR_PENDING == eState )
1224 bReadNextChar = bReadNextCharSave;
1226 else
1228 if( bOffState )
1230 // simply throw everything away
1231 ScanText( '>' );
1232 if( sal_Unicode(EOF) == nNextCh && rInput.IsEof() )
1234 // Move back in front of < and restart there.
1235 // Return < as text.
1236 rInput.Seek( nStreamPos );
1237 SetLineNr( nLineNr );
1238 SetLinePos( nLinePos );
1239 ClearTxtConvContext();
1241 aToken = "<";
1242 nRet = HTML_TEXTTOKEN;
1243 nNextCh = GetNextChar();
1244 bNextCh = false;
1245 break;
1247 if( SVPAR_PENDING == eState )
1248 bReadNextChar = bReadNextCharSave;
1249 aToken.clear();
// "<%": the content up to "%>" is kept in sSaveToken and reported as an
// unknown control.
1251 else if( '%' == nNextCh )
1253 nRet = HTML_UNKNOWNCONTROL_ON;
1255 sal_uLong nCStreamPos = rInput.Tell();
1256 sal_uLong nCLineNr = GetLineNr(), nCLinePos = GetLinePos();
1258 bool bDone = false;
1259 // Read until closing %>. If not found restart at first >.
1260 while( !bDone && !rInput.IsEof() && IsParserWorking() )
1262 bDone = '>'==nNextCh && aToken.endsWith("%");
1263 if( !bDone )
1265 aToken += OUString(nNextCh);
1266 nNextCh = GetNextChar();
1269 if( !bDone && IsParserWorking() )
1271 rInput.Seek( nCStreamPos );
1272 SetLineNr( nCLineNr );
1273 SetLinePos( nCLinePos );
1274 ClearTxtConvContext();
1275 aToken = "<%";
1276 nRet = HTML_TEXTTOKEN;
1277 break;
1279 if( IsParserWorking() )
1281 sSaveToken = aToken;
1282 aToken.clear();
1285 else
// A '<' that does not start a tag is returned as plain text.
1287 aToken = "<";
1288 nRet = HTML_TEXTTOKEN;
1289 bNextCh = false;
1290 break;
1294 if( IsParserWorking() )
// Consume the closing '>' and update the raw-reading modes for the
// tokens that switch them.
1296 bNextCh = '>' == nNextCh;
1297 switch( nRet )
1299 case HTML_TEXTAREA_ON:
1300 bReadTextArea = true;
1301 break;
1302 case HTML_TEXTAREA_OFF:
1303 bReadTextArea = false;
1304 break;
1305 case HTML_SCRIPT_ON:
1306 if( !bReadTextArea )
1307 bReadScript = true;
1308 break;
1309 case HTML_SCRIPT_OFF:
1310 if( !bReadTextArea )
1312 bReadScript = false;
1313 // JavaScript might modify the stream,
1314 // thus the last character has to be read again.
1315 bReadNextChar = true;
1316 bNextCh = false;
1318 break;
1320 case HTML_STYLE_ON:
1321 bReadStyle = true;
1322 break;
1323 case HTML_STYLE_OFF:
1324 bReadStyle = false;
1325 break;
1329 break;
1331 case sal_Unicode(EOF):
1332 if( rInput.IsEof() )
1334 eState = SVPAR_ACCEPTED;
1335 nRet = nNextCh;
1337 else
1339 // Read normal text.
1340 goto scan_text;
1342 break;
1344 case '\f':
1345 // form feeds are passed upwards separately
1346 nRet = HTML_LINEFEEDCHAR; // !!! should be FORMFEEDCHAR
1347 break;
1349 case '\n':
1350 case '\r':
1351 if( bReadListing || bReadXMP || bReadPRE || bReadTextArea )
// A CR/LF or LF/CR pair counts as a single line end; any other following
// character is kept as the next character to process.
1353 sal_Unicode c = GetNextChar();
1354 if( ( '\n' != nNextCh || '\r' != c ) &&
1355 ( '\r' != nNextCh || '\n' != c ) )
1357 bNextCh = false;
1358 nNextCh = c;
1360 nRet = HTML_NEWPARA;
1361 break;
1363 // no break !
1364 case '\t':
1365 if( bReadPRE )
1367 nRet = HTML_TABCHAR;
1368 break;
1370 // no break !
1371 case ' ':
1372 // no break !
1373 default:
1375 scan_text:
1376 // "normal" text to come
1377 nRet = ScanText();
1378 bNextCh = 0 == aToken.getLength();
1380 // the text should be processed
1381 if( !bNextCh && eState == SVPAR_PENDING )
1383 eState = SVPAR_WORKING;
1384 bReadNextChar = true;
1387 break;
1390 if( bNextCh && SVPAR_WORKING == eState )
1392 nNextCh = GetNextChar();
// A complete non-text token was found: deliver it now and read the
// missing character on the next call.
1393 if( SVPAR_PENDING == eState && nRet && HTML_TEXTTOKEN != nRet )
1395 bReadNextChar = true;
1396 eState = SVPAR_WORKING;
1400 } while( !nRet && SVPAR_WORKING == eState );
1402 if( SVPAR_PENDING == eState )
1403 nRet = -1; // s.th. invalid
1405 return nRet;
1408 void HTMLParser::UnescapeToken()
1410 sal_Int32 nPos=0;
1412 bool bEscape = false;
1413 while( nPos < aToken.getLength() )
1415 bool bOldEscape = bEscape;
1416 bEscape = false;
1417 if( '\\'==aToken[nPos] && !bOldEscape )
1419 aToken = aToken.replaceAt( nPos, 1, "" );
1420 bEscape = true;
1422 else
1424 nPos++;
// Parses the option (attribute) list held in aToken into maOptions and
// returns a reference to maOptions.
//
// If maOptions is already non-empty it is returned unchanged, so repeated
// calls for the same token do not parse again.
//
// pNoConvertToken: may be null. When non-null, CR/LF characters inside the
// quoted value of the option whose token equals *pNoConvertToken are kept
// instead of being stripped (see bStripCRLF below).
//
// NOTE: aToken is modified while it is scanned: escaping backslashes and
// stripped CR/LF characters are removed with replaceAt(). nPos therefore is
// only advanced for characters that are kept, and nLen counts kept characters.
1429 const HTMLOptions& HTMLParser::GetOptions( sal_uInt16 *pNoConvertToken )
1431 // If the options for the current token have already been returned,
1432 // return them once again.
1433 if (!maOptions.empty())
1434 return maOptions;
1436 sal_Int32 nPos = 0;
1437 while( nPos < aToken.getLength() )
1439 // A letter? Option beginning here.
1440 if( HTML_ISALPHA( aToken[nPos] ) )
1442 int nToken;
1443 OUString aValue;
1444 sal_Int32 nStt = nPos;
1445 sal_Unicode cChar = 0;
1447 // Actually only certain characters allowed.
1448 // Netscape only looks for "=" and white space (c.f.
1449 // Mozilla: PA_FetchRequestedNameValues in lipparse/pa_mdl.c)
// The option name runs up to the first '=', non-printable or white-space
// character; cChar is left holding the character that ended the name.
1450 while( nPos < aToken.getLength() && '=' != (cChar=aToken[nPos]) &&
1451 HTML_ISPRINTABLE(cChar) && !HTML_ISSPACE(cChar) )
1452 nPos++;
1454 OUString sName( aToken.copy( nStt, nPos-nStt ) );
1456 // PlugIns require original token name. Convert to lower case only for searching.
1457 nToken = GetHTMLOption( sName.toAsciiLowerCase() ); // Name is ready
1458 DBG_ASSERTWARNING( nToken!=HTML_O_UNKNOWN,
1459 "GetOption: unknown HTML option" );
// CR/LF in a quoted value are stripped unless the option lies in the range
// [HTML_OPTION_SCRIPT_START, HTML_OPTION_SCRIPT_END) or is the option the
// caller named via pNoConvertToken.
1460 bool bStripCRLF = (nToken < HTML_OPTION_SCRIPT_START ||
1461 nToken >= HTML_OPTION_SCRIPT_END) &&
1462 (!pNoConvertToken || nToken != *pNoConvertToken);
// Skip white space / non-printable characters between the name and a
// possible '='.
1464 while( nPos < aToken.getLength() &&
1465 ( !HTML_ISPRINTABLE( (cChar=aToken[nPos]) ) ||
1466 HTML_ISSPACE(cChar) ) )
1467 nPos++;
1469 // Option with value?
1470 if( nPos!=aToken.getLength() && '='==cChar )
1472 nPos++;
// Skip blanks, tabs, line ends and non-printable characters after the '='.
1474 while( nPos < aToken.getLength() &&
1475 ( !HTML_ISPRINTABLE( (cChar=aToken[nPos]) ) ||
1476 ' '==cChar || '\t'==cChar || '\r'==cChar || '\n'==cChar ) )
1477 nPos++;
1479 if( nPos != aToken.getLength() )
1481 sal_Int32 nLen = 0;
1482 nStt = nPos;
// Quoted value: either quote character may open it, and only the same
// character (when not escaped by a backslash) closes it.
1483 if( ('"'==cChar) || ('\'')==cChar )
1485 sal_Unicode cEnd = cChar;
// Step over the opening quote; the value starts behind it.
1486 nPos++; nStt++;
1487 bool bDone = false;
1488 bool bEscape = false;
1489 while( nPos < aToken.getLength() && !bDone )
1491 bool bOldEscape = bEscape;
1492 bEscape = false;
1493 cChar = aToken[nPos];
1494 switch( cChar )
1496 case '\r':
1497 case '\n':
1498 if( bStripCRLF )
1499 aToken = aToken.replaceAt( nPos, 1, "" );
1500 else
1501 nPos++, nLen++;
1502 break;
1503 case '\\':
// An escaped backslash is kept; an unescaped one is removed from aToken and
// escapes the character that follows it.
1504 if( bOldEscape )
1506 nPos++, nLen++;
1508 else
1510 aToken = aToken.replaceAt( nPos, 1, "" );
1511 bEscape = true;
1513 break;
1514 case '"':
1515 case '\'':
1516 bDone = !bOldEscape && cChar==cEnd;
1517 if( !bDone )
1518 nPos++, nLen++;
1519 break;
1520 default:
1521 nPos++, nLen++;
1522 break;
// Step over the closing quote, if the value was terminated by one.
1525 if( nPos!=aToken.getLength() )
1526 nPos++;
1528 else
1530 // More liberal than the standard: allow all printable characters
// Unquoted value: ends at an unescaped blank, at a tab, CR or LF, or at a
// non-printable character.
1531 bool bEscape = false;
1532 bool bDone = false;
1533 while( nPos < aToken.getLength() && !bDone )
1535 bool bOldEscape = bEscape;
1536 bEscape = false;
1537 sal_Unicode c = aToken[nPos];
1538 switch( c )
1540 case ' ':
1541 bDone = !bOldEscape;
1542 if( !bDone )
1543 nPos++, nLen++;
1544 break;
1546 case '\t':
1547 case '\r':
1548 case '\n':
1549 bDone = true;
1550 break;
1552 case '\\':
1553 if( bOldEscape )
1555 nPos++, nLen++;
1557 else
1559 aToken = aToken.replaceAt( nPos, 1, "" );
1560 bEscape = true;
1562 break;
1564 default:
1565 if( HTML_ISPRINTABLE( c ) )
1566 nPos++, nLen++;
1567 else
1568 bDone = true;
1569 break;
// aValue stays empty when no value character was kept.
1574 if( nLen )
1575 aValue = aToken.copy( nStt, nLen );
1579 // Token is known and can be saved
// The option stores the name in its original spelling (sName), not the
// lower-cased form used for the lookup.
1580 std::unique_ptr<HTMLOption> pOption(
1581 new HTMLOption(sal::static_int_cast<sal_uInt16>(nToken), sName, aValue));
1583 o3tl::ptr_container::push_back(maOptions, std::move(pOption));
1585 else
1586 // Ignore white space and unexpected characters
1587 nPos++;
1590 return maOptions;
1593 int HTMLParser::FilterPRE( int nToken )
1595 switch( nToken )
1597 // in Netscape they only have impact in not empty paragraphs
1598 case HTML_PARABREAK_ON:
1599 nToken = HTML_LINEBREAK;
1600 //fall-through
1601 case HTML_LINEBREAK:
1602 case HTML_NEWPARA:
1603 nPre_LinePos = 0;
1604 if( bPre_IgnoreNewPara )
1605 nToken = 0;
1606 break;
1608 case HTML_TABCHAR:
1610 sal_Int32 nSpaces = (8 - (nPre_LinePos % 8));
1611 DBG_ASSERT( aToken.isEmpty(), "Why is the token not empty?" );
1612 if (aToken.getLength() < nSpaces)
1614 using comphelper::string::padToLength;
1615 OUStringBuffer aBuf(aToken);
1616 aToken = padToLength(aBuf, nSpaces, ' ').makeStringAndClear();
1618 nPre_LinePos += nSpaces;
1619 nToken = HTML_TEXTTOKEN;
1621 break;
1622 // Keep those
1623 case HTML_TEXTTOKEN:
1624 nPre_LinePos += aToken.getLength();
1625 break;
1627 case HTML_SELECT_ON:
1628 case HTML_SELECT_OFF:
1629 case HTML_BODY_ON:
1630 case HTML_FORM_ON:
1631 case HTML_FORM_OFF:
1632 case HTML_INPUT:
1633 case HTML_OPTION:
1634 case HTML_TEXTAREA_ON:
1635 case HTML_TEXTAREA_OFF:
1637 case HTML_IMAGE:
1638 case HTML_APPLET_ON:
1639 case HTML_APPLET_OFF:
1640 case HTML_PARAM:
1641 case HTML_EMBED:
1643 case HTML_HEAD1_ON:
1644 case HTML_HEAD1_OFF:
1645 case HTML_HEAD2_ON:
1646 case HTML_HEAD2_OFF:
1647 case HTML_HEAD3_ON:
1648 case HTML_HEAD3_OFF:
1649 case HTML_HEAD4_ON:
1650 case HTML_HEAD4_OFF:
1651 case HTML_HEAD5_ON:
1652 case HTML_HEAD5_OFF:
1653 case HTML_HEAD6_ON:
1654 case HTML_HEAD6_OFF:
1655 case HTML_BLOCKQUOTE_ON:
1656 case HTML_BLOCKQUOTE_OFF:
1657 case HTML_ADDRESS_ON:
1658 case HTML_ADDRESS_OFF:
1659 case HTML_HORZRULE:
1661 case HTML_CENTER_ON:
1662 case HTML_CENTER_OFF:
1663 case HTML_DIVISION_ON:
1664 case HTML_DIVISION_OFF:
1666 case HTML_SCRIPT_ON:
1667 case HTML_SCRIPT_OFF:
1668 case HTML_RAWDATA:
1670 case HTML_TABLE_ON:
1671 case HTML_TABLE_OFF:
1672 case HTML_CAPTION_ON:
1673 case HTML_CAPTION_OFF:
1674 case HTML_COLGROUP_ON:
1675 case HTML_COLGROUP_OFF:
1676 case HTML_COL_ON:
1677 case HTML_COL_OFF:
1678 case HTML_THEAD_ON:
1679 case HTML_THEAD_OFF:
1680 case HTML_TFOOT_ON:
1681 case HTML_TFOOT_OFF:
1682 case HTML_TBODY_ON:
1683 case HTML_TBODY_OFF:
1684 case HTML_TABLEROW_ON:
1685 case HTML_TABLEROW_OFF:
1686 case HTML_TABLEDATA_ON:
1687 case HTML_TABLEDATA_OFF:
1688 case HTML_TABLEHEADER_ON:
1689 case HTML_TABLEHEADER_OFF:
1691 case HTML_ANCHOR_ON:
1692 case HTML_ANCHOR_OFF:
1693 case HTML_BOLD_ON:
1694 case HTML_BOLD_OFF:
1695 case HTML_ITALIC_ON:
1696 case HTML_ITALIC_OFF:
1697 case HTML_STRIKE_ON:
1698 case HTML_STRIKE_OFF:
1699 case HTML_STRIKETHROUGH_ON:
1700 case HTML_STRIKETHROUGH_OFF:
1701 case HTML_UNDERLINE_ON:
1702 case HTML_UNDERLINE_OFF:
1703 case HTML_BASEFONT_ON:
1704 case HTML_BASEFONT_OFF:
1705 case HTML_FONT_ON:
1706 case HTML_FONT_OFF:
1707 case HTML_BLINK_ON:
1708 case HTML_BLINK_OFF:
1709 case HTML_SPAN_ON:
1710 case HTML_SPAN_OFF:
1711 case HTML_SUBSCRIPT_ON:
1712 case HTML_SUBSCRIPT_OFF:
1713 case HTML_SUPERSCRIPT_ON:
1714 case HTML_SUPERSCRIPT_OFF:
1715 case HTML_BIGPRINT_ON:
1716 case HTML_BIGPRINT_OFF:
1717 case HTML_SMALLPRINT_OFF:
1718 case HTML_SMALLPRINT_ON:
1720 case HTML_EMPHASIS_ON:
1721 case HTML_EMPHASIS_OFF:
1722 case HTML_CITIATION_ON:
1723 case HTML_CITIATION_OFF:
1724 case HTML_STRONG_ON:
1725 case HTML_STRONG_OFF:
1726 case HTML_CODE_ON:
1727 case HTML_CODE_OFF:
1728 case HTML_SAMPLE_ON:
1729 case HTML_SAMPLE_OFF:
1730 case HTML_KEYBOARD_ON:
1731 case HTML_KEYBOARD_OFF:
1732 case HTML_VARIABLE_ON:
1733 case HTML_VARIABLE_OFF:
1734 case HTML_DEFINSTANCE_ON:
1735 case HTML_DEFINSTANCE_OFF:
1736 case HTML_SHORTQUOTE_ON:
1737 case HTML_SHORTQUOTE_OFF:
1738 case HTML_LANGUAGE_ON:
1739 case HTML_LANGUAGE_OFF:
1740 case HTML_AUTHOR_ON:
1741 case HTML_AUTHOR_OFF:
1742 case HTML_PERSON_ON:
1743 case HTML_PERSON_OFF:
1744 case HTML_ACRONYM_ON:
1745 case HTML_ACRONYM_OFF:
1746 case HTML_ABBREVIATION_ON:
1747 case HTML_ABBREVIATION_OFF:
1748 case HTML_INSERTEDTEXT_ON:
1749 case HTML_INSERTEDTEXT_OFF:
1750 case HTML_DELETEDTEXT_ON:
1751 case HTML_DELETEDTEXT_OFF:
1752 case HTML_TELETYPE_ON:
1753 case HTML_TELETYPE_OFF:
1755 break;
1757 // The remainder is treated as an unknown token.
1758 default:
1759 if( nToken )
1761 nToken =
1762 ( ((HTML_TOKEN_ONOFF & nToken) && (1 & nToken))
1763 ? HTML_UNKNOWNCONTROL_OFF
1764 : HTML_UNKNOWNCONTROL_ON );
1766 break;
1769 bPre_IgnoreNewPara = false;
1771 return nToken;
1774 int HTMLParser::FilterXMP( int nToken )
1776 switch( nToken )
1778 case HTML_NEWPARA:
1779 if( bPre_IgnoreNewPara )
1780 nToken = 0;
1781 case HTML_TEXTTOKEN:
1782 case HTML_NONBREAKSPACE:
1783 case HTML_SOFTHYPH:
1784 break; // kept
1786 default:
1787 if( nToken )
1789 if( (HTML_TOKEN_ONOFF & nToken) && (1 & nToken) )
1791 sSaveToken = "</" + sSaveToken;
1793 else
1794 sSaveToken = "<" + sSaveToken;
1795 if( !aToken.isEmpty() )
1797 UnescapeToken();
1798 sSaveToken += " ";
1799 aToken = sSaveToken + aToken;
1801 else
1802 aToken = sSaveToken;
1803 aToken += ">";
1804 nToken = HTML_TEXTTOKEN;
1806 break;
1809 bPre_IgnoreNewPara = false;
1811 return nToken;
1814 int HTMLParser::FilterListing( int nToken )
1816 switch( nToken )
1818 case HTML_NEWPARA:
1819 if( bPre_IgnoreNewPara )
1820 nToken = 0;
1821 case HTML_TEXTTOKEN:
1822 case HTML_NONBREAKSPACE:
1823 case HTML_SOFTHYPH:
1824 break; // kept
1826 default:
1827 if( nToken )
1829 nToken =
1830 ( ((HTML_TOKEN_ONOFF & nToken) && (1 & nToken))
1831 ? HTML_UNKNOWNCONTROL_OFF
1832 : HTML_UNKNOWNCONTROL_ON );
1834 break;
1837 bPre_IgnoreNewPara = false;
1839 return nToken;
1842 bool HTMLParser::InternalImgToPrivateURL( OUString& rURL )
1844 bool bFound = false;
1846 if( rURL.startsWith( OOO_STRING_SVTOOLS_HTML_internal_icon ) )
1848 OUString aName( rURL.copy(14) );
1849 switch( aName[0] )
1851 case 'b':
1852 bFound = aName == OOO_STRING_SVTOOLS_HTML_INT_ICON_baddata;
1853 break;
1854 case 'd':
1855 bFound = aName == OOO_STRING_SVTOOLS_HTML_INT_ICON_delayed;
1856 break;
1857 case 'e':
1858 bFound = aName == OOO_STRING_SVTOOLS_HTML_INT_ICON_embed;
1859 break;
1860 case 'i':
1861 bFound = aName == OOO_STRING_SVTOOLS_HTML_INT_ICON_insecure;
1862 break;
1863 case 'n':
1864 bFound = aName == OOO_STRING_SVTOOLS_HTML_INT_ICON_notfound;
1865 break;
1868 if( bFound )
1870 OUString sTmp ( rURL );
1871 rURL = OOO_STRING_SVTOOLS_HTML_private_image;
1872 rURL += sTmp;
1875 return bFound;
// Actions selected by the NAME / HTTP-EQUIV value of a <META> tag.
// HTML_META_NONE (0) means that no entry of the name table matched.
enum eHtmlMetas {
    HTML_META_NONE           = 0,
    HTML_META_AUTHOR         = 1,
    HTML_META_DESCRIPTION    = 2,
    HTML_META_KEYWORDS       = 3,
    HTML_META_REFRESH        = 4,
    HTML_META_CLASSIFICATION = 5,
    HTML_META_CREATED        = 6,
    HTML_META_CHANGEDBY      = 7,
    HTML_META_CHANGED        = 8,
    HTML_META_GENERATOR      = 9,
    HTML_META_SDFOOTNOTE     = 10,
    HTML_META_SDENDNOTE      = 11,
    HTML_META_CONTENT_TYPE   = 12
};
1894 // <META NAME=xxx>
1895 static HTMLOptionEnum const aHTMLMetaNameTable[] =
1897 { OOO_STRING_SVTOOLS_HTML_META_author, HTML_META_AUTHOR },
1898 { OOO_STRING_SVTOOLS_HTML_META_changed, HTML_META_CHANGED },
1899 { OOO_STRING_SVTOOLS_HTML_META_changedby, HTML_META_CHANGEDBY },
1900 { OOO_STRING_SVTOOLS_HTML_META_classification,HTML_META_CLASSIFICATION},
1901 { OOO_STRING_SVTOOLS_HTML_META_content_type, HTML_META_CONTENT_TYPE },
1902 { OOO_STRING_SVTOOLS_HTML_META_created, HTML_META_CREATED },
1903 { OOO_STRING_SVTOOLS_HTML_META_description, HTML_META_DESCRIPTION },
1904 { OOO_STRING_SVTOOLS_HTML_META_keywords, HTML_META_KEYWORDS },
1905 { OOO_STRING_SVTOOLS_HTML_META_generator, HTML_META_GENERATOR },
1906 { OOO_STRING_SVTOOLS_HTML_META_refresh, HTML_META_REFRESH },
1907 { OOO_STRING_SVTOOLS_HTML_META_sdendnote, HTML_META_SDENDNOTE },
1908 { OOO_STRING_SVTOOLS_HTML_META_sdfootnote, HTML_META_SDFOOTNOTE },
1909 { 0, 0 }
1913 void HTMLParser::AddMetaUserDefined( OUString const & )
1917 bool HTMLParser::ParseMetaOptionsImpl(
1918 const uno::Reference<document::XDocumentProperties> & i_xDocProps,
1919 SvKeyValueIterator *i_pHTTPHeader,
1920 const HTMLOptions& aOptions,
1921 rtl_TextEncoding& o_rEnc )
1923 OUString aName, aContent;
1924 sal_uInt16 nAction = HTML_META_NONE;
1925 bool bHTTPEquiv = false, bChanged = false;
1927 for ( size_t i = aOptions.size(); i; )
1929 const HTMLOption& aOption = aOptions[--i];
1930 switch ( aOption.GetToken() )
1932 case HTML_O_NAME:
1933 aName = aOption.GetString();
1934 if ( HTML_META_NONE==nAction )
1936 aOption.GetEnum( nAction, aHTMLMetaNameTable );
1938 break;
1939 case HTML_O_HTTPEQUIV:
1940 aName = aOption.GetString();
1941 aOption.GetEnum( nAction, aHTMLMetaNameTable );
1942 bHTTPEquiv = true;
1943 break;
1944 case HTML_O_CONTENT:
1945 aContent = aOption.GetString();
1946 break;
1950 if ( bHTTPEquiv || HTML_META_DESCRIPTION != nAction )
1952 // if it is not a Description, remove CRs and LFs from CONTENT
1953 aContent = comphelper::string::remove(aContent, '\r');
1954 aContent = comphelper::string::remove(aContent, '\n');
1956 else
1958 // convert line endings for Description
1959 aContent = convertLineEnd(aContent, GetSystemLineEnd());
1963 if ( bHTTPEquiv && i_pHTTPHeader )
1965 // Netscape seems to just ignore a closing ", so we do too
1966 if ( aContent.endsWith("\"") )
1968 aContent = aContent.copy( 0, aContent.getLength() - 1 );
1970 SvKeyValue aKeyValue( aName, aContent );
1971 i_pHTTPHeader->Append( aKeyValue );
1974 switch ( nAction )
1976 case HTML_META_AUTHOR:
1977 if (i_xDocProps.is()) {
1978 i_xDocProps->setAuthor( aContent );
1979 bChanged = true;
1981 break;
1982 case HTML_META_DESCRIPTION:
1983 if (i_xDocProps.is()) {
1984 i_xDocProps->setDescription( aContent );
1985 bChanged = true;
1987 break;
1988 case HTML_META_KEYWORDS:
1989 if (i_xDocProps.is()) {
1990 i_xDocProps->setKeywords(
1991 ::comphelper::string::convertCommaSeparated(aContent));
1992 bChanged = true;
1994 break;
1995 case HTML_META_CLASSIFICATION:
1996 if (i_xDocProps.is()) {
1997 i_xDocProps->setSubject( aContent );
1998 bChanged = true;
2000 break;
2002 case HTML_META_CHANGEDBY:
2003 if (i_xDocProps.is()) {
2004 i_xDocProps->setModifiedBy( aContent );
2006 break;
2008 case HTML_META_CREATED:
2009 case HTML_META_CHANGED:
2010 if ( i_xDocProps.is() && !aContent.isEmpty() &&
2011 comphelper::string::getTokenCount(aContent, ';') == 2 )
2013 Date aDate( (sal_uLong)aContent.getToken(0, ';').toInt32() );
2014 tools::Time aTime( (sal_uLong)aContent.getToken(1, ';').toInt32() );
2015 DateTime aDateTime( aDate, aTime );
2016 ::util::DateTime uDT = aDateTime.GetUNODateTime();
2017 if ( HTML_META_CREATED==nAction )
2018 i_xDocProps->setCreationDate( uDT );
2019 else
2020 i_xDocProps->setModificationDate( uDT );
2021 bChanged = true;
2023 break;
2025 case HTML_META_REFRESH:
2026 DBG_ASSERT( !bHTTPEquiv || i_pHTTPHeader,
2027 "Reload-URL aufgrund unterlassener MUSS-Aenderung verlorengegangen" );
2028 break;
2030 case HTML_META_CONTENT_TYPE:
2031 if ( !aContent.isEmpty() )
2033 o_rEnc = GetEncodingByMIME( aContent );
2035 break;
2037 case HTML_META_NONE:
2038 if ( !bHTTPEquiv )
2040 if (i_xDocProps.is())
2042 uno::Reference<beans::XPropertyContainer> xUDProps
2043 = i_xDocProps->getUserDefinedProperties();
2044 try {
2045 xUDProps->addProperty(aName,
2046 beans::PropertyAttribute::REMOVABLE,
2047 uno::makeAny(OUString(aContent)));
2048 AddMetaUserDefined(aName);
2049 bChanged = true;
2050 } catch (uno::Exception &) {
2051 // ignore
2055 break;
2056 default:
2057 break;
2060 return bChanged;
2063 bool HTMLParser::ParseMetaOptions(
2064 const uno::Reference<document::XDocumentProperties> & i_xDocProps,
2065 SvKeyValueIterator *i_pHeader )
2067 sal_uInt16 nContentOption = HTML_O_CONTENT;
2068 rtl_TextEncoding eEnc = RTL_TEXTENCODING_DONTKNOW;
2070 bool bRet = ParseMetaOptionsImpl( i_xDocProps, i_pHeader,
2071 GetOptions(&nContentOption),
2072 eEnc );
2074 // If the encoding is set by a META tag, it may only overwrite the
2075 // current encoding if both, the current and the new encoding, are 1-sal_uInt8
2076 // encodings. Everything else cannot lead to reasonable results.
2077 if (RTL_TEXTENCODING_DONTKNOW != eEnc &&
2078 rtl_isOctetTextEncoding( eEnc ) &&
2079 rtl_isOctetTextEncoding( GetSrcEncoding() ) )
2081 eEnc = GetExtendedCompatibilityTextEncoding( eEnc );
2082 SetSrcEncoding( eEnc );
2085 return bRet;
2088 rtl_TextEncoding HTMLParser::GetEncodingByMIME( const OUString& rMime )
2090 OUString sType;
2091 OUString sSubType;
2092 INetContentTypeParameterList aParameters;
2093 if (INetContentTypes::parse(rMime, sType, sSubType, &aParameters))
2095 const INetContentTypeParameter * pCharset = aParameters.find("charset");
2096 if (pCharset != 0)
2098 OString sValue(OUStringToOString(pCharset->m_sValue, RTL_TEXTENCODING_ASCII_US));
2099 return GetExtendedCompatibilityTextEncoding( rtl_getTextEncodingFromMimeCharset( sValue.getStr() ) );
2102 return RTL_TEXTENCODING_DONTKNOW;
2105 rtl_TextEncoding HTMLParser::GetEncodingByHttpHeader( SvKeyValueIterator *pHTTPHeader )
2107 rtl_TextEncoding eRet = RTL_TEXTENCODING_DONTKNOW;
2108 if( pHTTPHeader )
2110 SvKeyValue aKV;
2111 for( bool bCont = pHTTPHeader->GetFirst( aKV ); bCont;
2112 bCont = pHTTPHeader->GetNext( aKV ) )
2114 if( aKV.GetKey().equalsIgnoreAsciiCase( OOO_STRING_SVTOOLS_HTML_META_content_type ) )
2116 if( !aKV.GetValue().isEmpty() )
2118 eRet = HTMLParser::GetEncodingByMIME( aKV.GetValue() );
2123 return eRet;
2126 bool HTMLParser::SetEncodingByHTTPHeader( SvKeyValueIterator *pHTTPHeader )
2128 bool bRet = false;
2129 rtl_TextEncoding eEnc = HTMLParser::GetEncodingByHttpHeader( pHTTPHeader );
2130 if(RTL_TEXTENCODING_DONTKNOW != eEnc)
2132 SetSrcEncoding( eEnc );
2133 bRet = true;
2135 return bRet;
2139 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */