1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 * This file incorporates work covered by the following license notice:
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
20 #include <comphelper/string.hxx>
21 #include <o3tl/safeint.hxx>
22 #include <o3tl/string_view.hxx>
23 #include <tools/stream.hxx>
24 #include <tools/debug.hxx>
25 #include <tools/color.hxx>
26 #include <rtl/ustrbuf.hxx>
27 #include <rtl/character.hxx>
28 #include <rtl/tencinfo.h>
29 #include <sal/log.hxx>
30 #include <tools/tenccvt.hxx>
31 #include <tools/datetime.hxx>
32 #include <unotools/datetime.hxx>
33 #include <svl/inettype.hxx>
34 #include <svl/lngmisc.hxx>
35 #include <com/sun/star/beans/PropertyAttribute.hpp>
36 #include <com/sun/star/document/XDocumentProperties.hpp>
38 #include <svtools/parhtml.hxx>
39 #include <svtools/htmltokn.h>
40 #include <svtools/htmlkywd.hxx>
44 using namespace ::com::sun::star
;
47 const sal_Int32
MAX_LEN( 1024 );
49 const sal_Int32
MAX_ENTITY_LEN( 8 );
52 // Tables to convert option values into strings
55 HTMLOptionEnum
<HTMLInputType
> const aInputTypeOptEnums
[] =
57 { OOO_STRING_SVTOOLS_HTML_IT_text
, HTMLInputType::Text
},
58 { OOO_STRING_SVTOOLS_HTML_IT_password
, HTMLInputType::Password
},
59 { OOO_STRING_SVTOOLS_HTML_IT_checkbox
, HTMLInputType::Checkbox
},
60 { OOO_STRING_SVTOOLS_HTML_IT_radio
, HTMLInputType::Radio
},
61 { OOO_STRING_SVTOOLS_HTML_IT_range
, HTMLInputType::Range
},
62 { OOO_STRING_SVTOOLS_HTML_IT_scribble
, HTMLInputType::Scribble
},
63 { OOO_STRING_SVTOOLS_HTML_IT_file
, HTMLInputType::File
},
64 { OOO_STRING_SVTOOLS_HTML_IT_hidden
, HTMLInputType::Hidden
},
65 { OOO_STRING_SVTOOLS_HTML_IT_submit
, HTMLInputType::Submit
},
66 { OOO_STRING_SVTOOLS_HTML_IT_image
, HTMLInputType::Image
},
67 { OOO_STRING_SVTOOLS_HTML_IT_reset
, HTMLInputType::Reset
},
68 { OOO_STRING_SVTOOLS_HTML_IT_button
, HTMLInputType::Button
},
69 { nullptr, HTMLInputType(0) }
73 HTMLOptionEnum
<HTMLTableFrame
> const aTableFrameOptEnums
[] =
75 { OOO_STRING_SVTOOLS_HTML_TF_void
, HTMLTableFrame::Void
},
76 { OOO_STRING_SVTOOLS_HTML_TF_above
, HTMLTableFrame::Above
},
77 { OOO_STRING_SVTOOLS_HTML_TF_below
, HTMLTableFrame::Below
},
78 { OOO_STRING_SVTOOLS_HTML_TF_hsides
, HTMLTableFrame::HSides
},
79 { OOO_STRING_SVTOOLS_HTML_TF_lhs
, HTMLTableFrame::LHS
},
80 { OOO_STRING_SVTOOLS_HTML_TF_rhs
, HTMLTableFrame::RHS
},
81 { OOO_STRING_SVTOOLS_HTML_TF_vsides
, HTMLTableFrame::VSides
},
82 { OOO_STRING_SVTOOLS_HTML_TF_box
, HTMLTableFrame::Box
},
83 { OOO_STRING_SVTOOLS_HTML_TF_border
, HTMLTableFrame::Box
},
84 { nullptr, HTMLTableFrame(0) }
88 HTMLOptionEnum
<HTMLTableRules
> const aTableRulesOptEnums
[] =
90 { OOO_STRING_SVTOOLS_HTML_TR_none
, HTMLTableRules::NONE
},
91 { OOO_STRING_SVTOOLS_HTML_TR_groups
, HTMLTableRules::Groups
},
92 { OOO_STRING_SVTOOLS_HTML_TR_rows
, HTMLTableRules::Rows
},
93 { OOO_STRING_SVTOOLS_HTML_TR_cols
, HTMLTableRules::Cols
},
94 { OOO_STRING_SVTOOLS_HTML_TR_all
, HTMLTableRules::All
},
95 { nullptr, HTMLTableRules(0) }
99 HTMLOption::HTMLOption( HtmlOptionId nTok
, OUString _aToken
,
101 : aValue(std::move(_aValue
))
102 , aToken(std::move(_aToken
))
105 DBG_ASSERT( nToken
>=HtmlOptionId::BOOL_START
&& nToken
<HtmlOptionId::END
,
106 "HTMLOption: unknown token" );
109 sal_uInt32
HTMLOption::GetNumber() const
111 DBG_ASSERT( (nToken
>=HtmlOptionId::NUMBER_START
&&
112 nToken
<HtmlOptionId::NUMBER_END
) ||
113 (nToken
>=HtmlOptionId::CONTEXT_START
&&
114 nToken
<HtmlOptionId::CONTEXT_END
) ||
115 nToken
==HtmlOptionId::VALUE
,
116 "GetNumber: Option not numerical" );
117 OUString
aTmp(comphelper::string::stripStart(aValue
, ' '));
118 sal_Int32 nTmp
= aTmp
.toInt32();
119 return nTmp
>= 0 ? static_cast<sal_uInt32
>(nTmp
) : 0;
122 sal_Int32
HTMLOption::GetSNumber() const
124 DBG_ASSERT( (nToken
>=HtmlOptionId::NUMBER_START
&& nToken
<HtmlOptionId::NUMBER_END
) ||
125 (nToken
>=HtmlOptionId::CONTEXT_START
&& nToken
<HtmlOptionId::CONTEXT_END
),
126 "GetSNumber: Option not numerical" );
127 OUString
aTmp(comphelper::string::stripStart(aValue
, ' '));
128 return aTmp
.toInt32();
131 void HTMLOption::GetNumbers( std::vector
<sal_uInt32
> &rNumbers
) const
135 // This is a very simplified scanner: it only searches all
136 // numerals in the string.
139 for( sal_Int32 i
=0; i
<aValue
.getLength(); i
++ )
141 sal_Unicode c
= aValue
[ i
];
142 if( c
>='0' && c
<='9' )
150 rNumbers
.push_back( nNum
);
157 rNumbers
.push_back( nNum
);
161 void HTMLOption::GetColor( Color
& rColor
) const
163 DBG_ASSERT( (nToken
>=HtmlOptionId::COLOR_START
&& nToken
<HtmlOptionId::COLOR_END
) || nToken
==HtmlOptionId::SIZE
,
164 "GetColor: Option is not a color." );
166 OUString
aTmp(aValue
.toAsciiLowerCase());
167 sal_uInt32 nColor
= SAL_MAX_UINT32
;
168 if (!aTmp
.isEmpty() && aTmp
[0] != '#')
169 nColor
= GetHTMLColor(aTmp
);
171 if( SAL_MAX_UINT32
== nColor
)
175 for (sal_uInt32 i
=0; i
<6; ++i
)
177 // Whatever Netscape does to get color values,
178 // at maximum three characters < '0' are ignored.
179 sal_Unicode c
= nPos
<aTmp
.getLength() ? aTmp
[ nPos
++ ] : '0';
182 c
= nPos
<aTmp
.getLength() ? aTmp
[nPos
++] : '0';
184 c
= nPos
<aTmp
.getLength() ? aTmp
[nPos
++] : '0';
187 if( c
>= '0' && c
<= '9' )
189 else if( c
>= 'a' && c
<= 'f' )
190 nColor
+= (c
+ 0xa - 'a');
194 rColor
.SetRed( static_cast<sal_uInt8
>((nColor
& 0x00ff0000) >> 16) );
195 rColor
.SetGreen( static_cast<sal_uInt8
>((nColor
& 0x0000ff00) >> 8));
196 rColor
.SetBlue( static_cast<sal_uInt8
>(nColor
& 0x000000ff) );
199 HTMLInputType
HTMLOption::GetInputType() const
201 DBG_ASSERT( nToken
==HtmlOptionId::TYPE
, "GetInputType: Option not TYPE" );
202 return GetEnum( aInputTypeOptEnums
, HTMLInputType::Text
);
205 HTMLTableFrame
HTMLOption::GetTableFrame() const
207 DBG_ASSERT( nToken
==HtmlOptionId::FRAME
, "GetTableFrame: Option not FRAME" );
208 return GetEnum( aTableFrameOptEnums
);
211 HTMLTableRules
HTMLOption::GetTableRules() const
213 DBG_ASSERT( nToken
==HtmlOptionId::RULES
, "GetTableRules: Option not RULES" );
214 return GetEnum( aTableRulesOptEnums
);
217 HTMLParser::HTMLParser( SvStream
& rIn
, bool bReadNewDoc
) :
218 SvParser
<HtmlTokenId
>( rIn
),
219 bNewDoc(bReadNewDoc
),
224 bReadTextArea(false),
227 bEndTokenFound(false),
228 bPre_IgnoreNewPara(false),
229 bReadNextChar(false),
232 mnPendingOffToken(HtmlTokenId::NONE
)
234 //#i76649, default to UTF-8 for HTML unless we know differently
235 SetSrcEncoding(RTL_TEXTENCODING_UTF8
);
238 HTMLParser::~HTMLParser()
242 void HTMLParser::SetNamespace(std::u16string_view rNamespace
)
244 // Convert namespace alias to a prefix.
245 maNamespace
= OUString::Concat(rNamespace
) + ":";
253 HTMLParser
& m_rParser
;
255 RefGuard(HTMLParser
& rParser
)
258 m_rParser
.AddFirstRef();
263 if (m_rParser
.GetStatus() != SvParserState::Pending
)
264 m_rParser
.ReleaseRef(); // Parser not needed anymore
269 SvParserState
HTMLParser::CallParser()
271 eState
= SvParserState::Working
;
272 nNextCh
= GetNextChar();
273 SaveState( HtmlTokenId::NONE
);
276 bPre_IgnoreNewPara
= false;
278 RefGuard
aRefGuard(*this);
280 Continue( HtmlTokenId::NONE
);
285 void HTMLParser::Continue( HtmlTokenId nToken
)
287 if( nToken
== HtmlTokenId::NONE
)
288 nToken
= GetNextToken();
290 while( IsParserWorking() )
293 nToken
= FilterToken( nToken
);
295 if( nToken
!= HtmlTokenId::NONE
)
298 if( IsParserWorking() )
299 SaveState( HtmlTokenId::NONE
); // continue with new token
301 nToken
= GetNextToken();
305 HtmlTokenId
HTMLParser::FilterToken( HtmlTokenId nToken
)
309 case HtmlTokenId(EOF
):
310 nToken
= HtmlTokenId::NONE
;
313 case HtmlTokenId::HEAD_OFF
:
317 case HtmlTokenId::HEAD_ON
:
321 case HtmlTokenId::BODY_ON
:
325 case HtmlTokenId::FRAMESET_ON
:
329 case HtmlTokenId::BODY_OFF
:
330 bReadPRE
= bReadListing
= bReadXMP
= false;
333 case HtmlTokenId::HTML_OFF
:
334 nToken
= HtmlTokenId::NONE
;
335 bReadPRE
= bReadListing
= bReadXMP
= false;
336 break; // HtmlTokenId::ON hasn't been passed either !
338 case HtmlTokenId::PREFORMTXT_ON
:
342 case HtmlTokenId::PREFORMTXT_OFF
:
346 case HtmlTokenId::LISTING_ON
:
350 case HtmlTokenId::LISTING_OFF
:
354 case HtmlTokenId::XMP_ON
:
358 case HtmlTokenId::XMP_OFF
:
364 nToken
= FilterPRE( nToken
);
365 else if( bReadListing
)
366 nToken
= FilterListing( nToken
);
368 nToken
= FilterXMP( nToken
);
378 constexpr bool HTML_ISPRINTABLE(sal_Unicode c
) { return c
>= 32 && c
!= 127; }
382 HtmlTokenId
HTMLParser::ScanText( const sal_Unicode cBreak
)
384 OUStringBuffer
sTmpBuffer( MAX_LEN
);
385 bool bContinue
= true;
386 bool bEqSignFound
= false;
387 sal_uInt32 cQuote
= 0U;
389 while( bContinue
&& IsParserWorking() )
395 bEqSignFound
= false;
397 sTmpBuffer
.append( '&' );
400 sal_uInt64 nStreamPos
= rInput
.Tell();
401 sal_uInt32 nLinePos
= GetLinePos();
403 sal_uInt32 cChar
= 0U;
404 if( '#' == (nNextCh
= GetNextChar()) )
406 nNextCh
= GetNextChar();
407 const bool bIsHex( 'x' == nNextCh
);
408 const bool bIsDecOrHex( bIsHex
|| rtl::isAsciiDigit(nNextCh
) );
413 nNextCh
= GetNextChar();
414 while ( rtl::isAsciiHexDigit(nNextCh
) )
416 cChar
= cChar
* 16U +
418 ? sal_uInt32( nNextCh
- '0' )
420 ? sal_uInt32( nNextCh
- 'A' + 10 )
421 : sal_uInt32( nNextCh
- 'a' + 10 ) ) );
422 nNextCh
= GetNextChar();
429 cChar
= cChar
* 10U + sal_uInt32( nNextCh
- '0');
430 nNextCh
= GetNextChar();
432 while( rtl::isAsciiDigit(nNextCh
) );
435 if( RTL_TEXTENCODING_DONTKNOW
!= eSrcEnc
&&
436 RTL_TEXTENCODING_UCS2
!= eSrcEnc
&&
437 RTL_TEXTENCODING_UTF8
!= eSrcEnc
&&
440 const sal_uInt32 convertFlags
=
441 RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_DEFAULT
|
442 RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_DEFAULT
|
443 RTL_TEXTTOUNICODE_FLAGS_INVALID_DEFAULT
;
445 char cEncodedChar
= static_cast<char>(cChar
);
446 cChar
= OUString(&cEncodedChar
, 1, eSrcEnc
, convertFlags
).toChar();
449 // If the character could not be
450 // converted, because a conversion is not
451 // available, do no conversion at all.
452 cChar
= cEncodedChar
;
459 if (!rtl::isUnicodeCodePoint(cChar
)
460 || (linguistic::IsControlChar(cChar
)
461 && cChar
!= '\r' && cChar
!= '\n' && cChar
!= '\t'))
466 else if( rtl::isAsciiAlpha( nNextCh
) )
468 OUStringBuffer
sEntityBuffer( MAX_ENTITY_LEN
);
472 sEntityBuffer
.appendUtf32( nNextCh
);
474 nNextCh
= GetNextChar();
476 while( nPos
< MAX_ENTITY_LEN
&& rtl::isAsciiAlphanumeric( nNextCh
) &&
479 if( IsParserWorking() && !rInput
.eof() )
481 std::u16string_view
sEntity(sEntityBuffer
.subView(0, nPos
));
482 cChar
= GetHTMLCharName( sEntity
);
484 // not found ( == 0 ): plain text
485 // or a character which is inserted as attribute
486 if( 0U == cChar
&& ';' != nNextCh
)
488 DBG_ASSERT( rInput
.Tell() - nStreamPos
==
489 static_cast<sal_uInt64
>(nPos
+1)*GetCharSize(),
490 "UTF-8 is failing here" );
491 for( sal_Int32 i
= nPos
-1; i
>1; i
-- )
493 nNextCh
= sEntityBuffer
[i
];
494 sEntityBuffer
.setLength( i
);
495 sEntity
= sEntityBuffer
.subView(0, i
);
496 cChar
= GetHTMLCharName( sEntity
);
499 rInput
.SeekRel( -static_cast<sal_Int64
>
500 (nPos
-i
)*GetCharSize() );
501 nlLinePos
-= sal_uInt32(nPos
-i
);
503 ClearTxtConvContext();
509 if( !cChar
) // unknown character?
511 // back in stream, insert '&'
512 // and restart with next character
513 sTmpBuffer
.append( '&' );
515 DBG_ASSERT( rInput
.Tell()-nStreamPos
==
516 static_cast<sal_uInt64
>(nPos
+1)*GetCharSize(),
517 "Wrong stream position" );
518 DBG_ASSERT( nlLinePos
-nLinePos
==
519 static_cast<sal_uInt32
>(nPos
+1),
520 "Wrong line position" );
521 rInput
.Seek( nStreamPos
);
522 nlLinePos
= nLinePos
;
523 ClearTxtConvContext();
529 // 1 == Non Breaking Space
532 if (cChar
== 1 || cChar
== 2)
536 // When reading the content of a tag we have
537 // to change it to ' ' or '-'
545 // If not scanning a tag return token
546 aToken
.append( sTmpBuffer
);
547 sTmpBuffer
.setLength(0);
549 if( !aToken
.isEmpty() )
551 // restart with character
553 DBG_ASSERT( rInput
.Tell()-nStreamPos
==
554 static_cast<sal_uInt64
>(nPos
+1)*GetCharSize(),
555 "Wrong stream position" );
556 DBG_ASSERT( nlLinePos
-nLinePos
==
557 static_cast<sal_uInt32
>(nPos
+1),
558 "Wrong line position" );
559 rInput
.Seek( nStreamPos
);
560 nlLinePos
= nLinePos
;
561 ClearTxtConvContext();
562 return HtmlTokenId::TEXTTOKEN
;
565 // Hack: _GetNextChar shall not read the
568 aToken
.append( " " );
570 return HtmlTokenId::NONBREAKSPACE
;
572 return HtmlTokenId::SOFTHYPH
;
579 // &{...};-JavaScript-Macros are not supported any longer.
580 else if( IsParserWorking() )
582 sTmpBuffer
.append( '&' );
587 bNextCh
= (';' == nNextCh
);
588 if( cBreak
=='>' && (cChar
=='\\' || cChar
=='\'' ||
589 cChar
=='\"' || cChar
==' ') )
591 // ' and " have to be escaped within tags to separate
592 // them from ' and " enclosing options.
593 // \ has to be escaped as well.
594 // Space is protected because it's not a delimiter between
596 sTmpBuffer
.append( '\\' );
598 if( IsParserWorking() )
601 sTmpBuffer
.appendUtf32( cChar
);
603 else if( SvParserState::Pending
==eState
&& '>'!=cBreak
)
605 // Restart with '&', the remainder is returned as
607 if( !aToken
.isEmpty() || !sTmpBuffer
.isEmpty() )
609 // _GetNextChar() returns the previous text and
610 // during the next execution a new character is read.
611 // Thus we have to position in front of the '&'.
613 rInput
.Seek( nStreamPos
- GetCharSize() );
614 nlLinePos
= nLinePos
-1;
615 ClearTxtConvContext();
616 bReadNextChar
= true;
623 if( '>'==cBreak
&& !cQuote
)
625 sTmpBuffer
.appendUtf32( nNextCh
);
632 sTmpBuffer
.append( '\\' );
634 sTmpBuffer
.append( '\\' );
643 else if( cQuote
&& (cQuote
==nNextCh
) )
646 sTmpBuffer
.appendUtf32( nNextCh
);
647 bEqSignFound
= false;
650 case sal_Unicode(EOF
):
655 // else: ignore, not a valid code point
659 bEqSignFound
= false;
661 sTmpBuffer
.appendUtf32( nNextCh
);
663 bContinue
= false; // break, string is together
669 // If scanning options treat it like a space, ...
670 sTmpBuffer
.append( ' ' );
674 // otherwise it's a separate token.
683 // cr/lf in tag is handled in GetNextToken_()
684 sTmpBuffer
.appendUtf32( nNextCh
);
687 else if( bReadListing
|| bReadXMP
|| bReadPRE
|| bReadTextArea
)
692 // Reduce sequence of CR/LF/BLANK/TAB to a single blank
695 if( '\t'==nNextCh
&& bReadPRE
&& '>'!=cBreak
)
697 // Pass Tabs up in <PRE>
703 if( '\x0b'==nNextCh
&& (bReadPRE
|| bReadXMP
||bReadListing
) &&
711 sTmpBuffer
.appendUtf32( nNextCh
);
712 if( '>'!=cBreak
&& (!bReadListing
&& !bReadXMP
&&
713 !bReadPRE
&& !bReadTextArea
) )
715 // Reduce sequences of Blanks/Tabs/CR/LF to a single blank
717 nNextCh
= GetNextChar();
718 if( sal_Unicode(EOF
) == nNextCh
&& rInput
.eof() )
720 if( !aToken
.isEmpty() || sTmpBuffer
.getLength() > 1 )
722 // Have seen s.th. aside from blanks?
723 aToken
.append( sTmpBuffer
);
724 sTmpBuffer
.setLength(0);
725 return HtmlTokenId::TEXTTOKEN
;
728 // Only read blanks: no text must be returned
729 // and GetNextToken_ has to read until EOF
730 return HtmlTokenId::NONE
;
732 } while ( ' ' == nNextCh
|| '\t' == nNextCh
||
733 '\r' == nNextCh
|| '\n' == nNextCh
||
740 bEqSignFound
= false;
741 if (nNextCh
== cBreak
&& !cQuote
)
746 if (!linguistic::IsControlChar(nNextCh
))
748 // All remaining characters make their way into the text.
749 sTmpBuffer
.appendUtf32( nNextCh
);
752 nNextCh
= GetNextChar();
753 if( ( sal_Unicode(EOF
) == nNextCh
&& rInput
.eof() ) ||
756 if( !sTmpBuffer
.isEmpty() )
757 aToken
.append( sTmpBuffer
);
758 return HtmlTokenId::TEXTTOKEN
;
760 } while( rtl::isAsciiAlpha( nNextCh
) || rtl::isAsciiDigit( nNextCh
) );
765 if( bContinue
&& bNextCh
)
766 nNextCh
= GetNextChar();
769 if( !sTmpBuffer
.isEmpty() )
770 aToken
.append( sTmpBuffer
);
772 return HtmlTokenId::TEXTTOKEN
;
775 HtmlTokenId
HTMLParser::GetNextRawToken()
777 OUStringBuffer
sTmpBuffer( MAX_LEN
);
781 // During the last execution we already found the end token,
782 // thus we don't have to search it again.
786 bEndTokenFound
= false;
788 return HtmlTokenId::NONE
;
791 // Default return value: HtmlTokenId::RAWDATA
792 bool bContinue
= true;
793 HtmlTokenId nToken
= HtmlTokenId::RAWDATA
;
794 SaveState( HtmlTokenId::NONE
);
795 while( bContinue
&& IsParserWorking() )
802 // Maybe we've reached the end.
804 // Save what we have read previously...
805 aToken
.append( sTmpBuffer
);
806 sTmpBuffer
.setLength(0);
808 // and remember position in stream.
809 sal_uInt64 nStreamPos
= rInput
.Tell();
810 sal_uInt32 nLineNr
= GetLineNr();
811 sal_uInt32 nLinePos
= GetLinePos();
813 // Start of an end token?
814 bool bOffState
= false;
815 if( '/' == (nNextCh
= GetNextChar()) )
818 nNextCh
= GetNextChar();
820 else if( '!' == nNextCh
)
822 sTmpBuffer
.appendUtf32( nNextCh
);
823 nNextCh
= GetNextChar();
826 // Read following letters
827 while( (rtl::isAsciiAlpha(nNextCh
) || '-'==nNextCh
) &&
828 IsParserWorking() && sTmpBuffer
.getLength() < MAX_LEN
)
830 sTmpBuffer
.appendUtf32( nNextCh
);
831 nNextCh
= GetNextChar();
834 OUString
aTok( sTmpBuffer
.toString() );
835 aTok
= aTok
.toAsciiLowerCase();
837 if( bReadScript
|| !aEndToken
.isEmpty() )
841 if( aTok
.startsWith( OOO_STRING_SVTOOLS_HTML_comment
) )
847 // A script has to end with "</SCRIPT>". But
848 // ">" is optional for security reasons
851 ? aTok
== OOO_STRING_SVTOOLS_HTML_script
852 : aTok
== aEndToken
);
855 if( bReadComment
&& '>'==nNextCh
&& aTok
.endsWith( "--" ) )
857 // End of comment of style <!----->
858 bReadComment
= false;
863 // Style sheets can be closed by </STYLE>, </HEAD> or <BODY>
865 bDone
= aTok
== OOO_STRING_SVTOOLS_HTML_style
||
866 aTok
== OOO_STRING_SVTOOLS_HTML_head
;
868 bDone
= aTok
== OOO_STRING_SVTOOLS_HTML_body
;
873 // Done! Return the previously read string (if requested)
878 // nToken==0 means, GetNextToken_ continues to read
879 if( aToken
.isEmpty() && (bReadStyle
|| bReadScript
) )
881 // Immediately close environment (or context?)
882 // and parse the end token
886 nToken
= HtmlTokenId::NONE
;
890 // Keep bReadScript/bReadStyle alive
891 // and parse end token during next execution
892 bEndTokenFound
= true;
895 // Move backwards in stream to '<'
896 rInput
.Seek( nStreamPos
);
897 SetLineNr( nLineNr
);
898 SetLinePos( nLinePos
);
899 ClearTxtConvContext();
902 // Don't append string to token.
903 sTmpBuffer
.setLength( 0 );
907 // remember "</" , everything else we find in the buffer
908 aToken
.append( "<" );
910 aToken
.append( "/" );
917 sTmpBuffer
.appendUtf32( nNextCh
);
920 bool bTwoMinus
= false;
921 nNextCh
= GetNextChar();
922 while( '-' == nNextCh
&& IsParserWorking() )
925 sTmpBuffer
.appendUtf32( nNextCh
);
926 nNextCh
= GetNextChar();
929 if( '>' == nNextCh
&& IsParserWorking() && bTwoMinus
)
930 bReadComment
= false;
937 // \r\n? closes the current text token (even if it's empty)
938 nNextCh
= GetNextChar();
940 nNextCh
= GetNextChar();
944 // \n closes the current text token (even if it's empty)
945 nNextCh
= GetNextChar();
948 case sal_Unicode(EOF
):
949 // eof closes the current text token and behaves like having read
954 if( !aToken
.isEmpty() || !sTmpBuffer
.isEmpty() )
956 bEndTokenFound
= true;
963 nToken
= HtmlTokenId::NONE
;
968 if (!linguistic::IsControlChar(nNextCh
) || nNextCh
== '\t')
970 // all remaining characters are appended to the buffer
971 sTmpBuffer
.appendUtf32( nNextCh
);
976 if( !bContinue
&& !sTmpBuffer
.isEmpty() )
978 aToken
.append( sTmpBuffer
);
979 sTmpBuffer
.setLength(0);
982 if( bContinue
&& bNextCh
)
983 nNextCh
= GetNextChar();
986 if( IsParserWorking() )
987 SaveState( HtmlTokenId::NONE
);
989 nToken
= HtmlTokenId::NONE
;
995 HtmlTokenId
HTMLParser::GetNextToken_()
997 HtmlTokenId nRet
= HtmlTokenId::NONE
;
1000 if (mnPendingOffToken
!= HtmlTokenId::NONE
)
1002 // HtmlTokenId::<TOKEN>_OFF generated for HtmlTokenId::<TOKEN>_ON
1003 nRet
= mnPendingOffToken
;
1004 mnPendingOffToken
= HtmlTokenId::NONE
;
1005 aToken
.setLength( 0 );
1012 if( !IsParserWorking() ) // Don't continue if already an error occurred
1013 return HtmlTokenId::NONE
;
1015 bool bReadNextCharSave
= bReadNextChar
;
1018 DBG_ASSERT( !bEndTokenFound
,
1019 "Read a character despite </SCRIPT> was read?" );
1020 nNextCh
= GetNextChar();
1021 if( !IsParserWorking() ) // Don't continue if already an error occurred
1022 return HtmlTokenId::NONE
;
1023 bReadNextChar
= false;
1026 if( bReadScript
|| bReadStyle
|| !aEndToken
.isEmpty() )
1028 nRet
= GetNextRawToken();
1029 if( nRet
!= HtmlTokenId::NONE
|| !IsParserWorking() )
1034 bool bNextCh
= true;
1039 sal_uInt64 nStreamPos
= rInput
.Tell();
1040 sal_uInt32 nLineNr
= GetLineNr();
1041 sal_uInt32 nLinePos
= GetLinePos();
1043 bool bOffState
= false;
1044 if( '/' == (nNextCh
= GetNextChar()) )
1047 nNextCh
= GetNextChar();
1049 // Assume '<?' is a start of an XML declaration, ignore it.
1050 if (rtl::isAsciiAlpha(nNextCh
) || nNextCh
== '!' || nNextCh
== '?')
1052 OUStringBuffer sTmpBuffer
;
1054 sTmpBuffer
.appendUtf32( nNextCh
);
1055 nNextCh
= GetNextChar();
1056 if (std::u16string_view(sTmpBuffer
) == u
"![CDATA[")
1058 if (bFuzzing
&& sTmpBuffer
.getLength() > 1024)
1060 SAL_WARN("svtools", "abandoning import for performance reasons with long tokens");
1061 eState
= SvParserState::Error
;
1064 } while( '>' != nNextCh
&& '/' != nNextCh
&& !rtl::isAsciiWhiteSpace( nNextCh
) &&
1065 !linguistic::IsControlChar(nNextCh
) &&
1066 IsParserWorking() && !rInput
.eof() );
1068 if( !sTmpBuffer
.isEmpty() )
1070 aToken
.append( sTmpBuffer
);
1071 sTmpBuffer
.setLength(0);
1075 while( rtl::isAsciiWhiteSpace( nNextCh
) && IsParserWorking() )
1076 nNextCh
= GetNextChar();
1078 if( !IsParserWorking() )
1080 if( SvParserState::Pending
== eState
)
1081 bReadNextChar
= bReadNextCharSave
;
1085 // Search token in table:
1086 sSaveToken
= aToken
;
1087 aToken
= aToken
.toString().toAsciiLowerCase();
1089 if (!maNamespace
.isEmpty() && o3tl::starts_with(aToken
, maNamespace
))
1090 aToken
.remove( 0, maNamespace
.getLength());
1092 if( HtmlTokenId::NONE
== (nRet
= GetHTMLToken( aToken
)) )
1094 nRet
= HtmlTokenId::UNKNOWNCONTROL_ON
;
1096 // If it's a token which can be switched off...
1099 if( nRet
>= HtmlTokenId::ONOFF_START
)
1101 // and there is an off token, return off token instead
1102 nRet
= static_cast<HtmlTokenId
>(static_cast<int>(nRet
) + 1);
1104 else if( HtmlTokenId::LINEBREAK
!=nRet
|| !maNamespace
.isEmpty())
1106 // and there is no off token, return unknown token.
1107 // (except for </BR>, that is treated like <BR>)
1108 // No exception for XHTML, though.
1109 nRet
= HtmlTokenId::UNKNOWNCONTROL_OFF
;
1113 if( nRet
== HtmlTokenId::COMMENT
)
1115 // fix: due to being case sensitive use sSaveToken as start of comment
1116 // and append a blank.
1117 aToken
= sSaveToken
;
1119 aToken
.append( " " );
1120 sal_uInt64 nCStreamPos
= 0;
1121 sal_uInt32 nCLineNr
= 0;
1122 sal_uInt32 nCLinePos
= 0;
1123 sal_Int32 nCStrLen
= 0;
1126 // Read until closing -->. If not found restart at first >
1127 sTmpBuffer
= aToken
;
1128 while( !bDone
&& !rInput
.eof() && IsParserWorking() )
1134 nCStreamPos
= rInput
.Tell();
1135 nCStrLen
= sTmpBuffer
.getLength();
1136 nCLineNr
= GetLineNr();
1137 nCLinePos
= GetLinePos();
1139 bDone
= sTmpBuffer
.getLength() >= 2 && sTmpBuffer
[sTmpBuffer
.getLength() - 2] == '-' && sTmpBuffer
[sTmpBuffer
.getLength() - 1] == '-';
1141 sTmpBuffer
.appendUtf32(nNextCh
);
1143 else if (!linguistic::IsControlChar(nNextCh
)
1144 || nNextCh
== '\r' || nNextCh
== '\n' || nNextCh
== '\t')
1146 sTmpBuffer
.appendUtf32(nNextCh
);
1149 nNextCh
= GetNextChar();
1151 aToken
= sTmpBuffer
;
1152 sTmpBuffer
.setLength(0);
1153 if( !bDone
&& IsParserWorking() && nCStreamPos
)
1155 rInput
.Seek( nCStreamPos
);
1156 SetLineNr( nCLineNr
);
1157 SetLinePos( nCLinePos
);
1158 ClearTxtConvContext();
1159 aToken
.truncate(nCStrLen
);
1163 else if (nRet
== HtmlTokenId::CDATA
)
1165 // Read until the closing ]]>.
1167 while (!bDone
&& !rInput
.eof() && IsParserWorking())
1171 if (sTmpBuffer
.getLength() >= 2)
1173 bDone
= sTmpBuffer
[sTmpBuffer
.getLength() - 2] == ']'
1174 && sTmpBuffer
[sTmpBuffer
.getLength() - 1] == ']';
1177 // Ignore ]] at the end.
1178 sTmpBuffer
.setLength(sTmpBuffer
.getLength() - 2);
1183 sTmpBuffer
.appendUtf32(nNextCh
);
1186 else if (!linguistic::IsControlChar(nNextCh
))
1188 sTmpBuffer
.appendUtf32(nNextCh
);
1192 nNextCh
= GetNextChar();
1195 aToken
= sTmpBuffer
;
1196 sTmpBuffer
.setLength(0);
1200 // TokenString not needed anymore
1201 aToken
.setLength( 0 );
1204 // Read until closing '>'
1205 if( '>' != nNextCh
&& IsParserWorking() )
1209 // fdo#34666 fdo#36080 fdo#36390: closing "/>"?:
1210 // generate pending HtmlTokenId::<TOKEN>_OFF for HtmlTokenId::<TOKEN>_ON
1211 // Do not convert this to a single HtmlTokenId::<TOKEN>_OFF
1212 // which lead to fdo#56772.
1213 if ((nRet
>= HtmlTokenId::ONOFF_START
) && o3tl::ends_with(aToken
, u
"/"))
1215 mnPendingOffToken
= static_cast<HtmlTokenId
>(static_cast<int>(nRet
) + 1); // HtmlTokenId::<TOKEN>_ON -> HtmlTokenId::<TOKEN>_OFF
1216 aToken
.setLength( aToken
.getLength()-1 ); // remove trailing '/'
1218 if( sal_Unicode(EOF
) == nNextCh
&& rInput
.eof() )
1220 // Move back in front of < and restart there.
1221 // Return < as text.
1222 rInput
.Seek( nStreamPos
);
1223 SetLineNr( nLineNr
);
1224 SetLinePos( nLinePos
);
1225 ClearTxtConvContext();
1228 nRet
= HtmlTokenId::TEXTTOKEN
;
1229 nNextCh
= GetNextChar();
1234 if( SvParserState::Pending
== eState
)
1235 bReadNextChar
= bReadNextCharSave
;
1241 // simply throw away everything
1243 if( sal_Unicode(EOF
) == nNextCh
&& rInput
.eof() )
1245 // Move back in front of < and restart there.
1246 // Return < as text.
1247 rInput
.Seek( nStreamPos
);
1248 SetLineNr( nLineNr
);
1249 SetLinePos( nLinePos
);
1250 ClearTxtConvContext();
1253 nRet
= HtmlTokenId::TEXTTOKEN
;
1254 nNextCh
= GetNextChar();
1258 if( SvParserState::Pending
== eState
)
1259 bReadNextChar
= bReadNextCharSave
;
1260 aToken
.setLength( 0 );
1262 else if( '%' == nNextCh
)
1264 nRet
= HtmlTokenId::UNKNOWNCONTROL_ON
;
1266 sal_uInt64 nCStreamPos
= rInput
.Tell();
1267 sal_uInt32 nCLineNr
= GetLineNr(), nCLinePos
= GetLinePos();
1270 // Read until closing %>. If not found restart at first >.
1271 sal_Unicode nLastTokenChar
= !aToken
.isEmpty() ? aToken
[aToken
.getLength() - 1] : 0;
1272 OUStringBuffer
aTmpBuffer(aToken
);
1273 while( !bDone
&& !rInput
.eof() && IsParserWorking() )
1275 bDone
= '>'==nNextCh
&& nLastTokenChar
== '%';
1278 aTmpBuffer
.appendUtf32(nNextCh
);
1279 nLastTokenChar
= aTmpBuffer
[aTmpBuffer
.getLength() - 1];
1280 nNextCh
= GetNextChar();
1283 if( !bDone
&& IsParserWorking() )
1285 rInput
.Seek( nCStreamPos
);
1286 SetLineNr( nCLineNr
);
1287 SetLinePos( nCLinePos
);
1288 ClearTxtConvContext();
1290 nRet
= HtmlTokenId::TEXTTOKEN
;
1293 aToken
= aTmpBuffer
;
1294 aTmpBuffer
.setLength(0);
1295 if( IsParserWorking() )
1297 sSaveToken
= aToken
;
1298 aToken
.setLength( 0 );
1304 nRet
= HtmlTokenId::TEXTTOKEN
;
1310 if( IsParserWorking() )
1312 bNextCh
= '>' == nNextCh
;
1315 case HtmlTokenId::TEXTAREA_ON
:
1316 bReadTextArea
= true;
1318 case HtmlTokenId::TEXTAREA_OFF
:
1319 bReadTextArea
= false;
1321 case HtmlTokenId::SCRIPT_ON
:
1322 if( !bReadTextArea
)
1325 case HtmlTokenId::SCRIPT_OFF
:
1326 if( !bReadTextArea
)
1328 bReadScript
= false;
1329 // JavaScript might modify the stream,
1330 // thus the last character has to be read again.
1331 bReadNextChar
= true;
1336 case HtmlTokenId::STYLE_ON
:
1339 case HtmlTokenId::STYLE_OFF
:
1348 case sal_Unicode(EOF
):
1351 eState
= SvParserState::Accepted
;
1352 nRet
= HtmlTokenId(nNextCh
);
1356 // Read normal text.
1362 // form feeds are passed upwards separately
1363 nRet
= HtmlTokenId::LINEFEEDCHAR
; // !!! should be FORMFEEDCHAR
1368 if( bReadListing
|| bReadXMP
|| bReadPRE
|| bReadTextArea
)
1370 sal_Unicode c
= GetNextChar();
1371 if( ( '\n' != nNextCh
|| '\r' != c
) &&
1372 ( '\r' != nNextCh
|| '\n' != c
) )
1377 nRet
= HtmlTokenId::NEWPARA
;
1384 nRet
= HtmlTokenId::TABCHAR
;
1393 // "normal" text to come
1395 bNextCh
= 0 == aToken
.getLength();
1397 // the text should be processed
1398 if( !bNextCh
&& eState
== SvParserState::Pending
)
1400 eState
= SvParserState::Working
;
1401 bReadNextChar
= true;
1407 if( bNextCh
&& SvParserState::Working
== eState
)
1409 nNextCh
= GetNextChar();
1410 if( SvParserState::Pending
== eState
&& nRet
!= HtmlTokenId::NONE
&& HtmlTokenId::TEXTTOKEN
!= nRet
)
1412 bReadNextChar
= true;
1413 eState
= SvParserState::Working
;
1417 } while( nRet
== HtmlTokenId::NONE
&& SvParserState::Working
== eState
);
1419 if( SvParserState::Pending
== eState
)
1420 nRet
= HtmlTokenId::INVALID
; // s.th. invalid
1425 void HTMLParser::UnescapeToken()
1429 bool bEscape
= false;
1430 while( nPos
< aToken
.getLength() )
1432 bool bOldEscape
= bEscape
;
1434 if( '\\'==aToken
[nPos
] && !bOldEscape
)
1436 aToken
.remove( nPos
, 1 );
// Splits the attribute text of the current tag (aToken) into options and
// returns them as a reference to maOptions.
//
// pNoConvertToken: optional (may be null). An option whose id equals
//                  *pNoConvertToken is excluded from CR/LF stripping,
//                  see bStripCRLF below.
//
// Every option found is stored in maOptions as (id, original name, value).
// If maOptions is already filled for the current token, it is reused.
1446 const HTMLOptions
& HTMLParser::GetOptions( HtmlOptionId
const *pNoConvertToken
)
1448 // If the options for the current token have already been returned,
1449 // return them once again.
1450 if (!maOptions
.empty())
1454 while( nPos
< aToken
.getLength() )
1456 // A letter? Option beginning here.
1457 if( rtl::isAsciiAlpha( aToken
[nPos
] ) )
1459 HtmlOptionId nToken
;
// nStt marks the first character of the option name
1461 sal_Int32 nStt
= nPos
;
1462 sal_Unicode cChar
= 0;
1464 // Actually only certain characters allowed.
1465 // Netscape only looks for "=" and white space (c.f.
1466 // Mozilla: PA_FetchRequestedNameValues in libparse/pa_mdl.c)
1467 while( nPos
< aToken
.getLength() )
1469 cChar
= aToken
[nPos
];
// the name is delimited by '=', a non-printable character or white space
1470 if ( '=' == cChar
||!HTML_ISPRINTABLE(cChar
) || rtl::isAsciiWhiteSpace(cChar
) )
// the option name is the text from nStt up to (not including) nPos
1475 OUString
sName( aToken
.subView( nStt
, nPos
-nStt
) );
1477 // PlugIns require original token name. Convert to lower case only for searching.
1478 nToken
= GetHTMLOption( sName
.toAsciiLowerCase() ); // Name is ready
1479 SAL_WARN_IF( nToken
==HtmlOptionId::UNKNOWN
, "svtools",
1480 "GetOption: unknown HTML option '" << sName
<< "'" );
// CR/LF stripping is enabled for ids outside [SCRIPT_START, SCRIPT_END),
// unless the caller named this very option in pNoConvertToken.
// NOTE(review): the code consuming bStripCRLF is not visible in this excerpt.
1481 bool bStripCRLF
= (nToken
< HtmlOptionId::SCRIPT_START
||
1482 nToken
>= HtmlOptionId::SCRIPT_END
) &&
1483 (!pNoConvertToken
|| nToken
!= *pNoConvertToken
);
// look for the next printable, non-white-space character after the name
1485 while( nPos
< aToken
.getLength() )
1487 cChar
= aToken
[nPos
];
1488 if ( HTML_ISPRINTABLE(cChar
) && !rtl::isAsciiWhiteSpace(cChar
) )
1493 // Option with value?
1494 if( nPos
!=aToken
.getLength() && '='==cChar
)
// look for the first printable character that is not blank, tab, CR or LF
1498 while( nPos
< aToken
.getLength() )
1500 cChar
= aToken
[nPos
];
1501 if ( HTML_ISPRINTABLE(cChar
) && ' ' != cChar
&& '\t' != cChar
&& '\r' != cChar
&& '\n' != cChar
)
1506 if( nPos
!= aToken
.getLength() )
// quoted value: the opening quote character (" or ') is also the terminator
1510 if( ('"'==cChar
) || '\''==cChar
)
1512 sal_Unicode cEnd
= cChar
;
1515 bool bEscape
= false;
1516 while( nPos
< aToken
.getLength() && !bDone
)
1518 bool bOldEscape
= bEscape
;
1520 cChar
= aToken
[nPos
];
1526 aToken
.remove( nPos
, 1 );
1541 aToken
.remove( nPos
, 1 );
// an unescaped occurrence of the opening quote character ends the value
1547 bDone
= !bOldEscape
&& cChar
==cEnd
;
1560 if( nPos
!=aToken
.getLength() )
1565 // More liberal than the standard: allow all printable characters
1566 bool bEscape
= false;
1568 while( nPos
< aToken
.getLength() && !bDone
)
1570 bool bOldEscape
= bEscape
;
1572 sal_Unicode c
= aToken
[nPos
];
1576 bDone
= !bOldEscape
;
1598 aToken
.remove( nPos
, 1 );
1604 if( HTML_ISPRINTABLE( c
) )
// the value is the nLen characters of aToken starting at nStt
1617 aValue
= aToken
.subView( nStt
, nLen
);
1621 // Token is known and can be saved
1622 maOptions
.emplace_back(nToken
, sName
, aValue
);
1626 // Ignore white space and unexpected characters
// Token filter for preformatted text (NOTE(review): purpose taken from the
// name and the bPre_/nPre_ members - confirm against the callers).
//
// nToken: the token just read. The function reassigns nToken as follows:
//   - PARABREAK_ON is turned into LINEBREAK.
//   - LINEBREAK / NEWPARA become NONE while bPre_IgnoreNewPara is set.
//   - TABCHAR becomes a TEXTTOKEN of blanks reaching the next multiple of 8
//     of nPre_LinePos.
//   - TEXTTOKEN advances nPre_LinePos by the length of aToken.
//   - A long list of known tokens forms one case group (its closing
//     statement is not visible in this excerpt).
//   - Any remaining token other than NONE is mapped to UNKNOWNCONTROL_OFF
//     or UNKNOWNCONTROL_ON.
// bPre_IgnoreNewPara is reset to false near the end.
// NOTE(review): the return statement is not visible here; presumably the
// (possibly replaced) nToken is returned.
1633 HtmlTokenId
HTMLParser::FilterPRE( HtmlTokenId nToken
)
1637 // in Netscape they only have impact in not empty paragraphs
1638 case HtmlTokenId::PARABREAK_ON
:
1639 nToken
= HtmlTokenId::LINEBREAK
;
1641 case HtmlTokenId::LINEBREAK
:
1642 case HtmlTokenId::NEWPARA
:
1644 if( bPre_IgnoreNewPara
)
1645 nToken
= HtmlTokenId::NONE
;
1648 case HtmlTokenId::TABCHAR
:
// number of blanks needed to reach the next tab stop (tab stops every 8 columns)
1650 sal_Int32 nSpaces
= 8 - (nPre_LinePos
% 8);
1651 DBG_ASSERT( aToken
.isEmpty(), "Why is the token not empty?" );
1652 if (aToken
.getLength() < nSpaces
)
1654 using comphelper::string::padToLength
;
1655 OUStringBuffer
aBuf(aToken
);
1656 aToken
= padToLength(aBuf
, nSpaces
, ' ');
1658 nPre_LinePos
+= nSpaces
;
// the tab is handed on as ordinary text
1659 nToken
= HtmlTokenId::TEXTTOKEN
;
1663 case HtmlTokenId::TEXTTOKEN
:
1664 nPre_LinePos
+= aToken
.getLength();
// The tokens listed from here on share one case group.
// NOTE(review): presumably they pass through unchanged - the statement that
// closes the group is not visible in this excerpt.
1667 case HtmlTokenId::SELECT_ON
:
1668 case HtmlTokenId::SELECT_OFF
:
1669 case HtmlTokenId::BODY_ON
:
1670 case HtmlTokenId::FORM_ON
:
1671 case HtmlTokenId::FORM_OFF
:
1672 case HtmlTokenId::INPUT
:
1673 case HtmlTokenId::OPTION
:
1674 case HtmlTokenId::TEXTAREA_ON
:
1675 case HtmlTokenId::TEXTAREA_OFF
:
1677 case HtmlTokenId::IMAGE
:
1678 case HtmlTokenId::APPLET_ON
:
1679 case HtmlTokenId::APPLET_OFF
:
1680 case HtmlTokenId::PARAM
:
1681 case HtmlTokenId::EMBED
:
1683 case HtmlTokenId::HEAD1_ON
:
1684 case HtmlTokenId::HEAD1_OFF
:
1685 case HtmlTokenId::HEAD2_ON
:
1686 case HtmlTokenId::HEAD2_OFF
:
1687 case HtmlTokenId::HEAD3_ON
:
1688 case HtmlTokenId::HEAD3_OFF
:
1689 case HtmlTokenId::HEAD4_ON
:
1690 case HtmlTokenId::HEAD4_OFF
:
1691 case HtmlTokenId::HEAD5_ON
:
1692 case HtmlTokenId::HEAD5_OFF
:
1693 case HtmlTokenId::HEAD6_ON
:
1694 case HtmlTokenId::HEAD6_OFF
:
1695 case HtmlTokenId::BLOCKQUOTE_ON
:
1696 case HtmlTokenId::BLOCKQUOTE_OFF
:
1697 case HtmlTokenId::ADDRESS_ON
:
1698 case HtmlTokenId::ADDRESS_OFF
:
1699 case HtmlTokenId::HORZRULE
:
1701 case HtmlTokenId::CENTER_ON
:
1702 case HtmlTokenId::CENTER_OFF
:
1703 case HtmlTokenId::DIVISION_ON
:
1704 case HtmlTokenId::DIVISION_OFF
:
1706 case HtmlTokenId::SCRIPT_ON
:
1707 case HtmlTokenId::SCRIPT_OFF
:
1708 case HtmlTokenId::RAWDATA
:
1710 case HtmlTokenId::TABLE_ON
:
1711 case HtmlTokenId::TABLE_OFF
:
1712 case HtmlTokenId::CAPTION_ON
:
1713 case HtmlTokenId::CAPTION_OFF
:
1714 case HtmlTokenId::COLGROUP_ON
:
1715 case HtmlTokenId::COLGROUP_OFF
:
1716 case HtmlTokenId::COL_ON
:
1717 case HtmlTokenId::COL_OFF
:
1718 case HtmlTokenId::THEAD_ON
:
1719 case HtmlTokenId::THEAD_OFF
:
1720 case HtmlTokenId::TFOOT_ON
:
1721 case HtmlTokenId::TFOOT_OFF
:
1722 case HtmlTokenId::TBODY_ON
:
1723 case HtmlTokenId::TBODY_OFF
:
1724 case HtmlTokenId::TABLEROW_ON
:
1725 case HtmlTokenId::TABLEROW_OFF
:
1726 case HtmlTokenId::TABLEDATA_ON
:
1727 case HtmlTokenId::TABLEDATA_OFF
:
1728 case HtmlTokenId::TABLEHEADER_ON
:
1729 case HtmlTokenId::TABLEHEADER_OFF
:
1731 case HtmlTokenId::ANCHOR_ON
:
1732 case HtmlTokenId::ANCHOR_OFF
:
1733 case HtmlTokenId::BOLD_ON
:
1734 case HtmlTokenId::BOLD_OFF
:
1735 case HtmlTokenId::ITALIC_ON
:
1736 case HtmlTokenId::ITALIC_OFF
:
1737 case HtmlTokenId::STRIKE_ON
:
1738 case HtmlTokenId::STRIKE_OFF
:
1739 case HtmlTokenId::STRIKETHROUGH_ON
:
1740 case HtmlTokenId::STRIKETHROUGH_OFF
:
1741 case HtmlTokenId::UNDERLINE_ON
:
1742 case HtmlTokenId::UNDERLINE_OFF
:
1743 case HtmlTokenId::BASEFONT_ON
:
1744 case HtmlTokenId::BASEFONT_OFF
:
1745 case HtmlTokenId::FONT_ON
:
1746 case HtmlTokenId::FONT_OFF
:
1747 case HtmlTokenId::BLINK_ON
:
1748 case HtmlTokenId::BLINK_OFF
:
1749 case HtmlTokenId::SPAN_ON
:
1750 case HtmlTokenId::SPAN_OFF
:
1751 case HtmlTokenId::SUBSCRIPT_ON
:
1752 case HtmlTokenId::SUBSCRIPT_OFF
:
1753 case HtmlTokenId::SUPERSCRIPT_ON
:
1754 case HtmlTokenId::SUPERSCRIPT_OFF
:
1755 case HtmlTokenId::BIGPRINT_ON
:
1756 case HtmlTokenId::BIGPRINT_OFF
:
1757 case HtmlTokenId::SMALLPRINT_OFF
:
1758 case HtmlTokenId::SMALLPRINT_ON
:
1760 case HtmlTokenId::EMPHASIS_ON
:
1761 case HtmlTokenId::EMPHASIS_OFF
:
1762 case HtmlTokenId::CITATION_ON
:
1763 case HtmlTokenId::CITATION_OFF
:
1764 case HtmlTokenId::STRONG_ON
:
1765 case HtmlTokenId::STRONG_OFF
:
1766 case HtmlTokenId::CODE_ON
:
1767 case HtmlTokenId::CODE_OFF
:
1768 case HtmlTokenId::SAMPLE_ON
:
1769 case HtmlTokenId::SAMPLE_OFF
:
1770 case HtmlTokenId::KEYBOARD_ON
:
1771 case HtmlTokenId::KEYBOARD_OFF
:
1772 case HtmlTokenId::VARIABLE_ON
:
1773 case HtmlTokenId::VARIABLE_OFF
:
1774 case HtmlTokenId::DEFINSTANCE_ON
:
1775 case HtmlTokenId::DEFINSTANCE_OFF
:
1776 case HtmlTokenId::SHORTQUOTE_ON
:
1777 case HtmlTokenId::SHORTQUOTE_OFF
:
1778 case HtmlTokenId::LANGUAGE_ON
:
1779 case HtmlTokenId::LANGUAGE_OFF
:
1780 case HtmlTokenId::AUTHOR_ON
:
1781 case HtmlTokenId::AUTHOR_OFF
:
1782 case HtmlTokenId::PERSON_ON
:
1783 case HtmlTokenId::PERSON_OFF
:
1784 case HtmlTokenId::ACRONYM_ON
:
1785 case HtmlTokenId::ACRONYM_OFF
:
1786 case HtmlTokenId::ABBREVIATION_ON
:
1787 case HtmlTokenId::ABBREVIATION_OFF
:
1788 case HtmlTokenId::INSERTEDTEXT_ON
:
1789 case HtmlTokenId::INSERTEDTEXT_OFF
:
1790 case HtmlTokenId::DELETEDTEXT_ON
:
1791 case HtmlTokenId::DELETEDTEXT_OFF
:
1792 case HtmlTokenId::TELETYPE_ON
:
1793 case HtmlTokenId::TELETYPE_OFF
:
1797 // The remainder is treated as an unknown token.
1799 if( nToken
!= HtmlTokenId::NONE
)
// "off" tokens map to UNKNOWNCONTROL_OFF, everything else to UNKNOWNCONTROL_ON
// NOTE(review): the assignment target of this expression is not visible here.
1802 ( ((nToken
>= HtmlTokenId::ONOFF_START
) && isOffToken(nToken
))
1803 ? HtmlTokenId::UNKNOWNCONTROL_OFF
1804 : HtmlTokenId::UNKNOWNCONTROL_ON
);
1809 bPre_IgnoreNewPara
= false;
// Token filter for XMP content (NOTE(review): purpose taken from the name -
// confirm against the callers).
//
// nToken: the token just read.
//   - NEWPARA becomes NONE while bPre_IgnoreNewPara is set.
//   - TEXTTOKEN, NONBREAKSPACE and SOFTHYPH form their own case group.
//   - Any other token that is not NONE is turned back into text: sSaveToken
//     is prefixed with "</" (off tokens) or "<", combined with aToken,
//     ">" is appended and nToken becomes TEXTTOKEN.
// bPre_IgnoreNewPara is reset to false near the end.
// NOTE(review): the return statement is not visible here; presumably the
// (possibly replaced) nToken is returned.
1814 HtmlTokenId
HTMLParser::FilterXMP( HtmlTokenId nToken
)
1818 case HtmlTokenId::NEWPARA
:
1819 if( bPre_IgnoreNewPara
)
1820 nToken
= HtmlTokenId::NONE
;
1822 case HtmlTokenId::TEXTTOKEN
:
1823 case HtmlTokenId::NONBREAKSPACE
:
1824 case HtmlTokenId::SOFTHYPH
:
1828 if( nToken
!= HtmlTokenId::NONE
)
// rebuild the textual form of the tag: closing tags start with "</", others with "<"
1830 if( (nToken
>= HtmlTokenId::ONOFF_START
) && isOffToken(nToken
) )
1832 sSaveToken
= "</" + sSaveToken
;
1835 sSaveToken
= "<" + sSaveToken
;
// NOTE(review): lines between the following statements are missing in this
// excerpt; the insert and the plain assignment presumably belong to the two
// branches of this emptiness check.
1836 if( !aToken
.isEmpty() )
1840 aToken
.insert(0, sSaveToken
);
1843 aToken
= sSaveToken
;
1844 aToken
.append( ">" );
// the reconstructed tag is handed on as ordinary text
1845 nToken
= HtmlTokenId::TEXTTOKEN
;
1850 bPre_IgnoreNewPara
= false;
// Token filter for LISTING content (NOTE(review): purpose taken from the
// name - confirm against the callers).
//
// nToken: the token just read.
//   - NEWPARA becomes NONE while bPre_IgnoreNewPara is set.
//   - TEXTTOKEN, NONBREAKSPACE and SOFTHYPH form their own case group.
//   - Any other token that is not NONE is mapped to UNKNOWNCONTROL_OFF
//     (off tokens) or UNKNOWNCONTROL_ON.
// bPre_IgnoreNewPara is reset to false near the end.
// NOTE(review): the return statement is not visible here; presumably the
// (possibly replaced) nToken is returned.
1855 HtmlTokenId
HTMLParser::FilterListing( HtmlTokenId nToken
)
1859 case HtmlTokenId::NEWPARA
:
1860 if( bPre_IgnoreNewPara
)
1861 nToken
= HtmlTokenId::NONE
;
1863 case HtmlTokenId::TEXTTOKEN
:
1864 case HtmlTokenId::NONBREAKSPACE
:
1865 case HtmlTokenId::SOFTHYPH
:
1869 if( nToken
!= HtmlTokenId::NONE
)
// NOTE(review): the assignment target of this expression is not visible here.
1872 ( ((nToken
>= HtmlTokenId::ONOFF_START
) && isOffToken(nToken
))
1873 ? HtmlTokenId::UNKNOWNCONTROL_OFF
1874 : HtmlTokenId::UNKNOWNCONTROL_ON
);
1879 bPre_IgnoreNewPara
= false;
// Rewrites the URL of a built-in ("internal") icon to a private image URL.
//
// rURL: in/out. Only URLs starting with OOO_STRING_SVTOOLS_HTML_internal_icon
//       are examined. The part after the prefix is compared with the known
//       icon names (baddata, delayed, embed, insecure, notfound).
//       rURL is then assigned a value starting with
//       OOO_STRING_SVTOOLS_HTML_private_image.
// NOTE(review): the conditions selecting which name is compared, the guard
// around the rewrite and the return statement are not visible in this
// excerpt; presumably the rewrite happens only when bFound is true and
// bFound is returned.
1884 bool HTMLParser::InternalImgToPrivateURL( OUString
& rURL
)
1886 bool bFound
= false;
1888 if( rURL
.startsWith( OOO_STRING_SVTOOLS_HTML_internal_icon
) )
// NOTE(review): 14 is presumably the length of the internal-icon prefix
// checked above - confirm against the macro definition.
1890 OUString
aName( rURL
.copy(14) );
1894 bFound
= aName
== OOO_STRING_SVTOOLS_HTML_INT_ICON_baddata
;
1897 bFound
= aName
== OOO_STRING_SVTOOLS_HTML_INT_ICON_delayed
;
1900 bFound
= aName
== OOO_STRING_SVTOOLS_HTML_INT_ICON_embed
;
1903 bFound
= aName
== OOO_STRING_SVTOOLS_HTML_INT_ICON_insecure
;
1906 bFound
= aName
== OOO_STRING_SVTOOLS_HTML_INT_ICON_notfound
;
// keep a copy of the original URL before rURL is overwritten
1912 OUString
sTmp ( rURL
);
1913 rURL
= OOO_STRING_SVTOOLS_HTML_private_image
;
// Kinds of META entries distinguished by HTMLParser::ParseMetaOptionsImpl.
// NOTE(review): the enumerator list is not visible in this excerpt; the code
// in this file uses NONE, Author, Changed, ChangedBy, Classification,
// ContentType, Created, Description, Keywords, Generator, Refresh,
// SDEndnote and SDFootnote.
1922 enum class HtmlMeta
{
// Lookup table mapping the META name strings to their HtmlMeta value.
// It is passed to HTMLOption::GetEnum when a NAME or HTTP-EQUIV option is
// evaluated. The table ends with a { nullptr, HtmlMeta(0) } entry.
1941 HTMLOptionEnum
<HtmlMeta
> const aHTMLMetaNameTable
[] =
1943 { OOO_STRING_SVTOOLS_HTML_META_author
, HtmlMeta::Author
},
1944 { OOO_STRING_SVTOOLS_HTML_META_changed
, HtmlMeta::Changed
},
1945 { OOO_STRING_SVTOOLS_HTML_META_changedby
, HtmlMeta::ChangedBy
},
1946 { OOO_STRING_SVTOOLS_HTML_META_classification
,HtmlMeta::Classification
},
1947 { OOO_STRING_SVTOOLS_HTML_META_content_type
, HtmlMeta::ContentType
},
1948 { OOO_STRING_SVTOOLS_HTML_META_created
, HtmlMeta::Created
},
1949 { OOO_STRING_SVTOOLS_HTML_META_description
, HtmlMeta::Description
},
1950 { OOO_STRING_SVTOOLS_HTML_META_keywords
, HtmlMeta::Keywords
},
1951 { OOO_STRING_SVTOOLS_HTML_META_generator
, HtmlMeta::Generator
},
1952 { OOO_STRING_SVTOOLS_HTML_META_refresh
, HtmlMeta::Refresh
},
1953 { OOO_STRING_SVTOOLS_HTML_META_sdendnote
, HtmlMeta::SDEndnote
},
1954 { OOO_STRING_SVTOOLS_HTML_META_sdfootnote
, HtmlMeta::SDFootnote
},
// end-of-table marker
1955 { nullptr, HtmlMeta(0) }
// Hook called by ParseMetaOptionsImpl with the name of a META entry after it
// has been added as a user-defined document property.
// The parameter is unnamed, so this implementation does not use it.
// NOTE(review): the body is not visible in this excerpt; presumably it is
// empty and meant to be overridden - confirm in the class declaration.
1959 void HTMLParser::AddMetaUserDefined( OUString
const & )
// Evaluates the options of a META tag and applies them.
//
// i_xDocProps:   document properties to fill; may be an empty reference
//                (every use is guarded by is()).
// i_pHTTPHeader: optional HTTP header list; receives HTTP-EQUIV entries
//                as key/value pairs. May be null.
// aOptions:      the options of the META tag.
// o_rEnc:        out; set from a CHARSET option or from the charset of a
//                content-type entry.
//
// NAME / HTTP-EQUIV select the action through aHTMLMetaNameTable, CONTENT
// supplies the value. Known actions set the matching document property;
// entries with no known action are added as user-defined properties.
// NOTE(review): the return statement is not visible in this excerpt;
// presumably bChanged is returned.
1963 bool HTMLParser::ParseMetaOptionsImpl(
1964 const uno::Reference
<document::XDocumentProperties
> & i_xDocProps
,
1965 SvKeyValueIterator
*i_pHTTPHeader
,
1966 const HTMLOptions
& aOptions
,
1967 rtl_TextEncoding
& o_rEnc
)
1969 OUString aName
, aContent
;
1970 HtmlMeta nAction
= HtmlMeta::NONE
;
1971 bool bHTTPEquiv
= false, bChanged
= false;
// walk the options back to front: i starts at size() and is decremented on access
1973 for ( size_t i
= aOptions
.size(); i
; )
1975 const HTMLOption
& aOption
= aOptions
[--i
];
1976 switch ( aOption
.GetToken() )
1978 case HtmlOptionId::NAME
:
1979 aName
= aOption
.GetString();
// NAME determines the action only if none has been determined yet
1980 if ( HtmlMeta::NONE
==nAction
)
1982 aOption
.GetEnum( nAction
, aHTMLMetaNameTable
);
1985 case HtmlOptionId::HTTPEQUIV
:
1986 aName
= aOption
.GetString();
1987 aOption
.GetEnum( nAction
, aHTMLMetaNameTable
);
1990 case HtmlOptionId::CONTENT
:
1991 aContent
= aOption
.GetString();
1993 case HtmlOptionId::CHARSET
:
// map the MIME charset name to a text encoding
1995 OString
sValue(OUStringToOString(aOption
.GetString(), RTL_TEXTENCODING_ASCII_US
));
1996 o_rEnc
= GetExtendedCompatibilityTextEncoding(rtl_getTextEncodingFromMimeCharset(sValue
.getStr()));
2003 if ( bHTTPEquiv
|| HtmlMeta::Description
!= nAction
)
2005 // if it is not a Description, remove CRs and LFs from CONTENT
2006 aContent
= aContent
.replaceAll("\r", "").replaceAll("\n", "");
2010 // convert line endings for Description
2011 aContent
= convertLineEnd(aContent
, GetSystemLineEnd());
// HTTP-EQUIV entries are passed on to the header list, if one was supplied
2014 if ( bHTTPEquiv
&& i_pHTTPHeader
)
2016 // Netscape seems to just ignore a closing ", so we do too
2017 if ( aContent
.endsWith("\"") )
2019 aContent
= aContent
.copy( 0, aContent
.getLength() - 1 );
2021 SvKeyValue
aKeyValue( aName
, aContent
);
2022 i_pHTTPHeader
->Append( aKeyValue
);
2027 case HtmlMeta::Author
:
2028 if (i_xDocProps
.is()) {
2029 i_xDocProps
->setAuthor( aContent
);
2033 case HtmlMeta::Description
:
2034 if (i_xDocProps
.is()) {
2035 i_xDocProps
->setDescription( aContent
);
2039 case HtmlMeta::Keywords
:
2040 if (i_xDocProps
.is()) {
2041 i_xDocProps
->setKeywords(
2042 ::comphelper::string::convertCommaSeparated(aContent
));
// Classification is stored as the document subject
2046 case HtmlMeta::Classification
:
2047 if (i_xDocProps
.is()) {
2048 i_xDocProps
->setSubject( aContent
);
2053 case HtmlMeta::ChangedBy
:
2054 if (i_xDocProps
.is()) {
2055 i_xDocProps
->setModifiedBy( aContent
);
2060 case HtmlMeta::Created
:
2061 case HtmlMeta::Changed
:
2062 if (i_xDocProps
.is() && !aContent
.isEmpty())
2064 ::util::DateTime uDT
;
// two ';'-separated tokens: an integer date followed by an integer time
2066 if (comphelper::string::getTokenCount(aContent
, ';') == 2)
2068 sal_Int32 nIdx
{ 0 };
2069 Date
aDate(o3tl::toInt32(o3tl::getToken(aContent
, 0, ';', nIdx
)));
2070 auto nTime
= o3tl::toInt64(o3tl::getToken(aContent
, 0, ';', nIdx
));
2072 nTime
= o3tl::saturating_toggle_sign(nTime
);
2073 tools::Time
aTime(nTime
);
2074 DateTime
aDateTime(aDate
, aTime
);
2075 uDT
= aDateTime
.GetUNODateTime();
// otherwise try to parse the content as an ISO 8601 date/time
2078 else if (utl::ISO8601parseDateTime(aContent
, uDT
))
2084 if (HtmlMeta::Created
== nAction
)
2085 i_xDocProps
->setCreationDate(uDT
);
2087 i_xDocProps
->setModificationDate(uDT
);
2092 case HtmlMeta::Refresh
:
2093 DBG_ASSERT( !bHTTPEquiv
|| i_pHTTPHeader
, "Lost Reload-URL because of omitted MUST change." );
2096 case HtmlMeta::ContentType
:
2097 if ( !aContent
.isEmpty() )
2099 o_rEnc
= GetEncodingByMIME( aContent
);
// no known action: keep the entry as a user-defined document property
2103 case HtmlMeta::NONE
:
2106 if (i_xDocProps
.is())
2108 uno::Reference
<beans::XPropertyContainer
> xUDProps
2109 = i_xDocProps
->getUserDefinedProperties();
2111 xUDProps
->addProperty(aName
,
2112 beans::PropertyAttribute::REMOVABLE
,
2113 uno::Any(aContent
));
2114 AddMetaUserDefined(aName
);
// NOTE(review): the handler body is not visible in this excerpt
2116 } catch (uno::Exception
&) {
// Parses the options of the current META tag and applies a resulting
// encoding to the source stream where this is possible.
//
// i_xDocProps: document properties, forwarded to ParseMetaOptionsImpl.
// i_pHeader:   optional HTTP header list, forwarded to ParseMetaOptionsImpl.
//
// The options are obtained through GetOptions with CONTENT as the
// "no convert" option.
// NOTE(review): the return statement is not visible in this excerpt;
// presumably bRet, the result of ParseMetaOptionsImpl, is returned.
2129 bool HTMLParser::ParseMetaOptions(
2130 const uno::Reference
<document::XDocumentProperties
> & i_xDocProps
,
2131 SvKeyValueIterator
*i_pHeader
)
2133 HtmlOptionId nContentOption
= HtmlOptionId::CONTENT
;
2134 rtl_TextEncoding eEnc
= RTL_TEXTENCODING_DONTKNOW
;
2136 bool bRet
= ParseMetaOptionsImpl( i_xDocProps
, i_pHeader
,
2137 GetOptions(&nContentOption
),
2140 // If the encoding is set by a META tag, it may only overwrite the
2141 // current encoding if both the current and the new encoding are 1-byte
2142 // encodings. Everything else cannot lead to reasonable results.
2143 if (RTL_TEXTENCODING_DONTKNOW
!= eEnc
&&
2144 rtl_isOctetTextEncoding( eEnc
) &&
2145 rtl_isOctetTextEncoding( GetSrcEncoding() ) )
2147 eEnc
= GetExtendedCompatibilityTextEncoding( eEnc
);
2148 SetSrcEncoding( eEnc
);
// Extracts the text encoding from a MIME content-type string.
//
// rMime: content-type value, parsed with INetContentTypes::parse.
// Returns the encoding named by the "charset" parameter, mapped through
// rtl_getTextEncodingFromMimeCharset and
// GetExtendedCompatibilityTextEncoding. RTL_TEXTENCODING_DONTKNOW is
// returned at the end of the function otherwise.
2154 rtl_TextEncoding
HTMLParser::GetEncodingByMIME( const OUString
& rMime
)
2158 INetContentTypeParameterList aParameters
;
2159 if (INetContentTypes::parse(rMime
, sType
, sSubType
, &aParameters
))
2161 auto const iter
= aParameters
.find("charset");
2162 if (iter
!= aParameters
.end())
2164 const INetContentTypeParameter
* pCharset
= &iter
->second
;
// the charset lookup takes a narrow string, so convert the value first
2165 OString
sValue(OUStringToOString(pCharset
->m_sValue
, RTL_TEXTENCODING_ASCII_US
));
2166 return GetExtendedCompatibilityTextEncoding( rtl_getTextEncodingFromMimeCharset( sValue
.getStr() ) );
2169 return RTL_TEXTENCODING_DONTKNOW
;
// Determines the text encoding announced in a list of HTTP header entries.
//
// pHTTPHeader: header list, iterated with GetFirst/GetNext.
// For an entry whose key equals the content-type name (ASCII
// case-insensitive) and whose value is not empty, the encoding is taken from
// the value via GetEncodingByMIME.
// NOTE(review): the null check on pHTTPHeader, any early loop exit and the
// return statement are not visible in this excerpt; presumably eRet is
// returned, which starts out as RTL_TEXTENCODING_DONTKNOW.
2172 rtl_TextEncoding
HTMLParser::GetEncodingByHttpHeader( SvKeyValueIterator
*pHTTPHeader
)
2174 rtl_TextEncoding eRet
= RTL_TEXTENCODING_DONTKNOW
;
2178 for( bool bCont
= pHTTPHeader
->GetFirst( aKV
); bCont
;
2179 bCont
= pHTTPHeader
->GetNext( aKV
) )
2181 if( aKV
.GetKey().equalsIgnoreAsciiCase( OOO_STRING_SVTOOLS_HTML_META_content_type
) )
2183 if( !aKV
.GetValue().isEmpty() )
2185 eRet
= HTMLParser::GetEncodingByMIME( aKV
.GetValue() );
// Sets the source encoding from the content-type entry of an HTTP header list.
//
// pHTTPHeader: header list, forwarded to GetEncodingByHttpHeader.
// SetSrcEncoding is called only when an encoding could be determined
// (i.e. the result is not RTL_TEXTENCODING_DONTKNOW).
// NOTE(review): the returned value is not visible in this excerpt;
// presumably it reports whether the encoding was set.
2193 bool HTMLParser::SetEncodingByHTTPHeader( SvKeyValueIterator
*pHTTPHeader
)
2196 rtl_TextEncoding eEnc
= HTMLParser::GetEncodingByHttpHeader( pHTTPHeader
);
2197 if(RTL_TEXTENCODING_DONTKNOW
!= eEnc
)
2199 SetSrcEncoding( eEnc
);
2206 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */