1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 * This file incorporates work covered by the following license notice:
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
20 #include <comphelper/string.hxx>
21 #include <o3tl/safeint.hxx>
22 #include <o3tl/string_view.hxx>
23 #include <tools/stream.hxx>
24 #include <tools/debug.hxx>
25 #include <tools/color.hxx>
26 #include <rtl/ustrbuf.hxx>
27 #include <rtl/character.hxx>
28 #include <rtl/tencinfo.h>
29 #include <sal/log.hxx>
30 #include <tools/tenccvt.hxx>
31 #include <tools/datetime.hxx>
32 #include <unotools/datetime.hxx>
33 #include <svl/inettype.hxx>
34 #include <svl/lngmisc.hxx>
35 #include <com/sun/star/beans/PropertyAttribute.hpp>
36 #include <com/sun/star/document/XDocumentProperties.hpp>
38 #include <svtools/parhtml.hxx>
39 #include <svtools/htmltokn.h>
40 #include <svtools/htmlkywd.hxx>
44 using namespace ::com::sun::star
;
// Size handed to the constructors of the temporary scan buffers in
// ScanText() and GetNextRawToken(); GetNextRawToken() also uses it to
// cap the number of tag-name characters it collects.
47 const sal_Int32
MAX_LEN( 1024 );
// Size handed to the entity-name buffer in ScanText(), and the maximum
// number of characters collected for a named character entity.
49 const sal_Int32
MAX_ENTITY_LEN( 8 );
52 // Tables to convert option values into strings
// Lookup table pairing the textual values of an <INPUT TYPE=...> option
// with their HTMLInputType enumerators. Passed to GetEnum() by
// HTMLOption::GetInputType(). The last entry has a nullptr name
// (presumably the end-of-table marker GetEnum() looks for - confirm).
55 HTMLOptionEnum
<HTMLInputType
> const aInputTypeOptEnums
[] =
57 { OOO_STRING_SVTOOLS_HTML_IT_text
, HTMLInputType::Text
},
58 { OOO_STRING_SVTOOLS_HTML_IT_password
, HTMLInputType::Password
},
59 { OOO_STRING_SVTOOLS_HTML_IT_checkbox
, HTMLInputType::Checkbox
},
60 { OOO_STRING_SVTOOLS_HTML_IT_radio
, HTMLInputType::Radio
},
61 { OOO_STRING_SVTOOLS_HTML_IT_range
, HTMLInputType::Range
},
62 { OOO_STRING_SVTOOLS_HTML_IT_scribble
, HTMLInputType::Scribble
},
63 { OOO_STRING_SVTOOLS_HTML_IT_file
, HTMLInputType::File
},
64 { OOO_STRING_SVTOOLS_HTML_IT_hidden
, HTMLInputType::Hidden
},
65 { OOO_STRING_SVTOOLS_HTML_IT_submit
, HTMLInputType::Submit
},
66 { OOO_STRING_SVTOOLS_HTML_IT_image
, HTMLInputType::Image
},
67 { OOO_STRING_SVTOOLS_HTML_IT_reset
, HTMLInputType::Reset
},
68 { OOO_STRING_SVTOOLS_HTML_IT_button
, HTMLInputType::Button
},
69 { nullptr, HTMLInputType(0) }
// Lookup table pairing the textual values of a table FRAME option with
// their HTMLTableFrame enumerators. Passed to GetEnum() by
// HTMLOption::GetTableFrame(). Note that both the "box" and the "border"
// keyword map to HTMLTableFrame::Box. The last entry has a nullptr name
// (presumably the end-of-table marker - confirm).
73 HTMLOptionEnum
<HTMLTableFrame
> const aTableFrameOptEnums
[] =
75 { OOO_STRING_SVTOOLS_HTML_TF_void
, HTMLTableFrame::Void
},
76 { OOO_STRING_SVTOOLS_HTML_TF_above
, HTMLTableFrame::Above
},
77 { OOO_STRING_SVTOOLS_HTML_TF_below
, HTMLTableFrame::Below
},
78 { OOO_STRING_SVTOOLS_HTML_TF_hsides
, HTMLTableFrame::HSides
},
79 { OOO_STRING_SVTOOLS_HTML_TF_lhs
, HTMLTableFrame::LHS
},
80 { OOO_STRING_SVTOOLS_HTML_TF_rhs
, HTMLTableFrame::RHS
},
81 { OOO_STRING_SVTOOLS_HTML_TF_vsides
, HTMLTableFrame::VSides
},
82 { OOO_STRING_SVTOOLS_HTML_TF_box
, HTMLTableFrame::Box
},
83 { OOO_STRING_SVTOOLS_HTML_TF_border
, HTMLTableFrame::Box
},
84 { nullptr, HTMLTableFrame(0) }
// Lookup table pairing the textual values of a table RULES option with
// their HTMLTableRules enumerators. Passed to GetEnum() by
// HTMLOption::GetTableRules(). The last entry has a nullptr name
// (presumably the end-of-table marker - confirm).
88 HTMLOptionEnum
<HTMLTableRules
> const aTableRulesOptEnums
[] =
90 { OOO_STRING_SVTOOLS_HTML_TR_none
, HTMLTableRules::NONE
},
91 { OOO_STRING_SVTOOLS_HTML_TR_groups
, HTMLTableRules::Groups
},
92 { OOO_STRING_SVTOOLS_HTML_TR_rows
, HTMLTableRules::Rows
},
93 { OOO_STRING_SVTOOLS_HTML_TR_cols
, HTMLTableRules::Cols
},
94 { OOO_STRING_SVTOOLS_HTML_TR_all
, HTMLTableRules::All
},
95 { nullptr, HTMLTableRules(0) }
// Constructs an option from its id, its name (_aToken) and its value
// (_aValue). Both strings are moved into the aToken / aValue members.
// In debug builds, asserts that the stored id lies in the half-open range
// [HtmlOptionId::BOOL_START, HtmlOptionId::END).
99 HTMLOption::HTMLOption( HtmlOptionId nTok
, OUString _aToken
,
101 : aValue(std::move(_aValue
))
102 , aToken(std::move(_aToken
))
105 DBG_ASSERT( nToken
>=HtmlOptionId::BOOL_START
&& nToken
<HtmlOptionId::END
,
106 "HTMLOption: unknown token" );
// Returns the option value as an unsigned number.
// Leading blanks are stripped, the rest is parsed with toInt32(), and a
// negative result is mapped to 0.
// In debug builds, asserts that the option is a NUMBER or CONTEXT option
// or the VALUE option.
109 sal_uInt32
HTMLOption::GetNumber() const
111 DBG_ASSERT( (nToken
>=HtmlOptionId::NUMBER_START
&&
112 nToken
<HtmlOptionId::NUMBER_END
) ||
113 (nToken
>=HtmlOptionId::CONTEXT_START
&&
114 nToken
<HtmlOptionId::CONTEXT_END
) ||
115 nToken
==HtmlOptionId::VALUE
,
116 "GetNumber: Option not numerical" );
117 OUString
aTmp(comphelper::string::stripStart(aValue
, ' '));
118 sal_Int32 nTmp
= aTmp
.toInt32();
119 return nTmp
>= 0 ? static_cast<sal_uInt32
>(nTmp
) : 0;
// Returns the option value as a signed number.
// Leading blanks are stripped and the rest is parsed with toInt32();
// unlike GetNumber(), negative results are returned unchanged.
// In debug builds, asserts that the option is a NUMBER or CONTEXT option
// (the VALUE option is not accepted here, unlike in GetNumber()).
122 sal_Int32
HTMLOption::GetSNumber() const
124 DBG_ASSERT( (nToken
>=HtmlOptionId::NUMBER_START
&& nToken
<HtmlOptionId::NUMBER_END
) ||
125 (nToken
>=HtmlOptionId::CONTEXT_START
&& nToken
<HtmlOptionId::CONTEXT_END
),
126 "GetSNumber: Option not numerical" );
127 OUString
aTmp(comphelper::string::stripStart(aValue
, ' '));
128 return aTmp
.toInt32();
// Extracts the numbers contained in the option value and appends them to
// rNumbers. The value is walked character by character and the ASCII
// digits '0'..'9' are recognised; everything else is not interpreted.
// NOTE(review): the lines that accumulate nNum and decide when a number
// is complete are not visible in this excerpt - confirm the exact
// splitting rules against the full source.
131 void HTMLOption::GetNumbers( std::vector
<sal_uInt32
> &rNumbers
) const
135 // This is a very simplified scanner: it only searches all
136 // numerals in the string.
139 for( sal_Int32 i
=0; i
<aValue
.getLength(); i
++ )
141 sal_Unicode c
= aValue
[ i
];
142 if( c
>='0' && c
<='9' )
150 rNumbers
.push_back( nNum
);
157 rNumbers
.push_back( nNum
);
// Interprets the option value as a color and stores the result in rColor.
//
// The value is converted to lower case first. Unless it is empty or starts
// with '#', it is looked up with GetHTMLColor(). If the color is still
// SAL_MAX_UINT32 afterwards, the value is read as (up to) six hex digits;
// positions past the end of the string are treated as '0'.
// Finally the red, green and blue components are taken from bits 16-23,
// 8-15 and 0-7 of the resulting number.
//
// In debug builds, asserts that the option is a COLOR option or SIZE.
161 void HTMLOption::GetColor( Color
& rColor
) const
163 DBG_ASSERT( (nToken
>=HtmlOptionId::COLOR_START
&& nToken
<HtmlOptionId::COLOR_END
) || nToken
==HtmlOptionId::SIZE
,
164 "GetColor: Option is not a color." );
166 OUString
aTmp(aValue
.toAsciiLowerCase());
167 sal_uInt32 nColor
= SAL_MAX_UINT32
;
168 if (!aTmp
.isEmpty() && aTmp
[0] != '#')
169 nColor
= GetHTMLColor(aTmp
);
171 if( SAL_MAX_UINT32
== nColor
)
175 for (sal_uInt32 i
=0; i
<6; ++i
)
177 // Whatever Netscape does to get color values,
178 // at maximum three characters < '0' are ignored.
179 sal_Unicode c
= nPos
<aTmp
.getLength() ? aTmp
[ nPos
++ ] : '0';
182 c
= nPos
<aTmp
.getLength() ? aTmp
[nPos
++] : '0';
184 c
= nPos
<aTmp
.getLength() ? aTmp
[nPos
++] : '0';
187 if( c
>= '0' && c
<= '9' )
189 else if( c
>= 'a' && c
<= 'f' )
190 nColor
+= (c
+ 0xa - 'a');
194 rColor
.SetRed( static_cast<sal_uInt8
>((nColor
& 0x00ff0000) >> 16) );
195 rColor
.SetGreen( static_cast<sal_uInt8
>((nColor
& 0x0000ff00) >> 8));
196 rColor
.SetBlue( static_cast<sal_uInt8
>(nColor
& 0x000000ff) );
// Returns the option value mapped through aInputTypeOptEnums.
// HTMLInputType::Text is passed to GetEnum() as the second argument
// (presumably the fallback for unknown values - confirm).
// In debug builds, asserts that this is the TYPE option.
199 HTMLInputType
HTMLOption::GetInputType() const
201 DBG_ASSERT( nToken
==HtmlOptionId::TYPE
, "GetInputType: Option not TYPE" );
202 return GetEnum( aInputTypeOptEnums
, HTMLInputType::Text
);
// Returns the option value mapped through aTableFrameOptEnums.
// In debug builds, asserts that this is the FRAME option.
205 HTMLTableFrame
HTMLOption::GetTableFrame() const
207 DBG_ASSERT( nToken
==HtmlOptionId::FRAME
, "GetTableFrame: Option not FRAME" );
208 return GetEnum( aTableFrameOptEnums
);
// Returns the option value mapped through aTableRulesOptEnums.
// In debug builds, asserts that this is the RULES option.
211 HTMLTableRules
HTMLOption::GetTableRules() const
213 DBG_ASSERT( nToken
==HtmlOptionId::RULES
, "GetTableRules: Option not RULES" );
214 return GetEnum( aTableRulesOptEnums
);
// Creates a parser reading from rIn. bReadNewDoc is stored in bNewDoc.
// The state flags visible here start out as false and no OFF token is
// pending. The source encoding defaults to UTF-8 until something else is
// set.
// NOTE(review): several member initializers are not visible in this
// excerpt.
217 HTMLParser::HTMLParser( SvStream
& rIn
, bool bReadNewDoc
) :
218 SvParser
<HtmlTokenId
>( rIn
),
219 bNewDoc(bReadNewDoc
),
224 bReadTextArea(false),
227 bEndTokenFound(false),
228 bPre_IgnoreNewPara(false),
229 bReadNextChar(false),
232 mnPendingOffToken(HtmlTokenId::NONE
)
234 //#i76649, default to UTF-8 for HTML unless we know differently
235 SetSrcEncoding(RTL_TEXTENCODING_UTF8
);
238 HTMLParser::~HTMLParser()
// Sets the namespace alias. The alias is stored in maNamespace with a
// trailing ':' appended, i.e. in the form it has as a prefix of a
// qualified name.
242 void HTMLParser::SetNamespace(std::u16string_view rNamespace
)
244 // Convert namespace alias to a prefix.
245 maNamespace
= OUString::Concat(rNamespace
) + ":";
// Helper that takes a reference on the parser for the duration of a
// parse: the constructor calls AddFirstRef() on the parser, and the
// reference is released again unless the parser status is Pending.
// NOTE(review): the struct header and the destructor signature are not
// visible in this excerpt; the release below is presumably the
// destructor body - confirm.
253 HTMLParser
& m_rParser
;
255 RefGuard(HTMLParser
& rParser
)
258 m_rParser
.AddFirstRef();
263 if (m_rParser
.GetStatus() != SvParserState::Pending
)
264 m_rParser
.ReleaseRef(); // Parser not needed anymore
// Entry point of a parse run: switches the state to Working, reads the
// first character, saves the state, clears bPre_IgnoreNewPara and then
// runs Continue() under a RefGuard that holds a reference on this parser.
// NOTE(review): the return statement is not visible in this excerpt.
269 SvParserState
HTMLParser::CallParser()
271 eState
= SvParserState::Working
;
272 nNextCh
= GetNextChar();
273 SaveState( HtmlTokenId::NONE
);
276 bPre_IgnoreNewPara
= false;
278 RefGuard
aRefGuard(*this);
280 Continue( HtmlTokenId::NONE
);
// Main token loop. If nToken is NONE, the first token is fetched with
// GetNextToken(). While the parser is working, each token is passed
// through FilterToken(); after that the state is saved (if the parser is
// still working) and the next token is fetched.
// NOTE(review): what happens with a token that is not NONE after
// filtering is on a line not visible in this excerpt.
285 void HTMLParser::Continue( HtmlTokenId nToken
)
287 if( nToken
== HtmlTokenId::NONE
)
288 nToken
= GetNextToken();
290 while( IsParserWorking() )
293 nToken
= FilterToken( nToken
);
295 if( nToken
!= HtmlTokenId::NONE
)
298 if( IsParserWorking() )
299 SaveState( HtmlTokenId::NONE
); // continue with new token
301 nToken
= GetNextToken();
// Filters a token before it is processed further and returns the
// (possibly replaced) token.
//  - EOF and HTML_OFF are turned into HtmlTokenId::NONE.
//  - BODY_OFF and HTML_OFF reset bReadPRE, bReadListing and bReadXMP.
//  - Otherwise the token is routed through FilterPRE(), FilterListing()
//    or FilterXMP(), depending on which of these modes is active.
// NOTE(review): the bodies of most case labels are not visible in this
// excerpt.
305 HtmlTokenId
HTMLParser::FilterToken( HtmlTokenId nToken
)
309 case HtmlTokenId(EOF
):
310 nToken
= HtmlTokenId::NONE
;
313 case HtmlTokenId::HEAD_OFF
:
317 case HtmlTokenId::HEAD_ON
:
321 case HtmlTokenId::BODY_ON
:
325 case HtmlTokenId::FRAMESET_ON
:
329 case HtmlTokenId::BODY_OFF
:
330 bReadPRE
= bReadListing
= bReadXMP
= false;
333 case HtmlTokenId::HTML_OFF
:
334 nToken
= HtmlTokenId::NONE
;
335 bReadPRE
= bReadListing
= bReadXMP
= false;
336 break; // HtmlTokenId::ON hasn't been passed either !
338 case HtmlTokenId::PREFORMTXT_ON
:
342 case HtmlTokenId::PREFORMTXT_OFF
:
346 case HtmlTokenId::LISTING_ON
:
350 case HtmlTokenId::LISTING_OFF
:
354 case HtmlTokenId::XMP_ON
:
358 case HtmlTokenId::XMP_OFF
:
364 nToken
= FilterPRE( nToken
);
365 else if( bReadListing
)
366 nToken
= FilterListing( nToken
);
368 nToken
= FilterXMP( nToken
);
// True for every character with a code of at least 32 (space) except
// 127 (DEL).
378 constexpr bool HTML_ISPRINTABLE(sal_Unicode c
) { return c
>= 32 && c
!= 127; }
// True for blank, horizontal tab, carriage return, line feed and
// vertical tab (0x0b).
380 constexpr bool HTML_ISSPACE(sal_uInt32 c
)
382 return ' ' == c
|| '\t' == c
|| '\r' == c
|| '\n' == c
|| '\x0b' == c
;
// Scans running text and appends it to aToken.
//
// cBreak is the character at which scanning stops. With cBreak == '>' the
// content of a tag is being read: quotes, backslashes and blanks that
// result from character references are then escaped with a backslash so
// that they can be told apart from the characters delimiting options.
//
// '&' introduces a character reference, either numeric ("&#..." decimal,
// "&#x..." hexadecimal) or named. References that cannot be resolved are
// emitted as plain text: an '&' is appended and the stream is rewound to
// the position right after the '&'.
//
// Returns HtmlTokenId::TEXTTOKEN in the normal case. The other return
// values visible here are NONBREAKSPACE, SOFTHYPH, and NONE (when only
// blanks were read before EOF).
387 HtmlTokenId
HTMLParser::ScanText(const sal_Unicode cBreak
)
389 OUStringBuffer
sTmpBuffer( MAX_LEN
);
390 bool bContinue
= true;
391 bool bEqSignFound
= false;
392 sal_uInt32 cQuote
= 0U;
394 while( bContinue
&& IsParserWorking() )
400 bEqSignFound
= false;
402 sTmpBuffer
.append( '&' );
// Remember where the reference started so that the stream and the line
// position can be restored if it cannot be resolved.
405 sal_uInt64 nStreamPos
= rInput
.Tell();
406 sal_uInt32 nLinePos
= GetLinePos();
408 sal_uInt32 cChar
= 0U;
409 if( '#' == (nNextCh
= GetNextChar()) )
411 nNextCh
= GetNextChar();
// Numeric character reference. Only a lower-case 'x' is recognised as the
// hexadecimal marker here.
412 const bool bIsHex( 'x' == nNextCh
);
413 const bool bIsDecOrHex( bIsHex
|| rtl::isAsciiDigit(nNextCh
) );
418 nNextCh
= GetNextChar();
419 while ( rtl::isAsciiHexDigit(nNextCh
) )
421 cChar
= cChar
* 16U +
423 ? sal_uInt32( nNextCh
- '0' )
425 ? sal_uInt32( nNextCh
- 'A' + 10 )
426 : sal_uInt32( nNextCh
- 'a' + 10 ) ) );
427 nNextCh
= GetNextChar();
434 cChar
= cChar
* 10U + sal_uInt32( nNextCh
- '0');
435 nNextCh
= GetNextChar();
437 while( rtl::isAsciiDigit(nNextCh
) );
// For source encodings other than DONTKNOW, UCS2 and UTF8 the number is
// taken as a single byte in the source encoding and converted to Unicode.
440 if( RTL_TEXTENCODING_DONTKNOW
!= eSrcEnc
&&
441 RTL_TEXTENCODING_UCS2
!= eSrcEnc
&&
442 RTL_TEXTENCODING_UTF8
!= eSrcEnc
&&
445 const sal_uInt32 convertFlags
=
446 RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_DEFAULT
|
447 RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_DEFAULT
|
448 RTL_TEXTTOUNICODE_FLAGS_INVALID_DEFAULT
;
450 char cEncodedChar
= static_cast<char>(cChar
);
451 cChar
= OUString(&cEncodedChar
, 1, eSrcEnc
, convertFlags
).toChar();
454 // If the character could not be
455 // converted, because a conversion is not
456 // available, do no conversion at all.
457 cChar
= cEncodedChar
;
464 if (!rtl::isUnicodeCodePoint(cChar
)
465 || (linguistic::IsControlChar(cChar
)
466 && cChar
!= '\r' && cChar
!= '\n' && cChar
!= '\t'))
// Named character reference: collect up to MAX_ENTITY_LEN alphanumeric
// characters and look the name up with GetHTMLCharName().
471 else if( rtl::isAsciiAlpha( nNextCh
) )
473 OUStringBuffer
sEntityBuffer( MAX_ENTITY_LEN
);
477 sEntityBuffer
.appendUtf32( nNextCh
);
479 nNextCh
= GetNextChar();
481 while( nPos
< MAX_ENTITY_LEN
&& rtl::isAsciiAlphanumeric( nNextCh
) &&
484 if( IsParserWorking() && !rInput
.eof() )
486 std::u16string_view
sEntity(sEntityBuffer
.subView(0, nPos
));
487 cChar
= GetHTMLCharName( sEntity
);
489 // not found ( == 0 ): plain text
490 // or a character which is inserted as attribute
491 if( 0U == cChar
&& ';' != nNextCh
)
493 DBG_ASSERT( rInput
.Tell() - nStreamPos
==
494 static_cast<sal_uInt64
>(nPos
+1)*GetCharSize(),
495 "UTF-8 is failing here" );
// Retry the lookup with successively shorter prefixes of the collected
// name (down to a length of two characters).
496 for( sal_Int32 i
= nPos
-1; i
>1; i
-- )
498 nNextCh
= sEntityBuffer
[i
];
499 sEntityBuffer
.setLength( i
);
500 sEntity
= sEntityBuffer
.subView(0, i
);
501 cChar
= GetHTMLCharName( sEntity
);
504 rInput
.SeekRel( -static_cast<sal_Int64
>
505 (nPos
-i
)*GetCharSize() );
506 nlLinePos
-= sal_uInt32(nPos
-i
);
508 ClearTxtConvContext();
514 if( !cChar
) // unknown character?
516 // back in stream, insert '&'
517 // and restart with next character
518 sTmpBuffer
.append( '&' );
520 DBG_ASSERT( rInput
.Tell()-nStreamPos
==
521 static_cast<sal_uInt64
>(nPos
+1)*GetCharSize(),
522 "Wrong stream position" );
523 DBG_ASSERT( nlLinePos
-nLinePos
==
524 static_cast<sal_uInt32
>(nPos
+1),
525 "Wrong line position" );
526 rInput
.Seek( nStreamPos
);
527 nlLinePos
= nLinePos
;
528 ClearTxtConvContext();
534 // 1 == Non Breaking Space
537 if (cChar
== 1 || cChar
== 2)
541 // When reading the content of a tag we have
542 // to change it to ' ' or '-'
550 // If not scanning a tag return token
551 aToken
.append( sTmpBuffer
);
552 sTmpBuffer
.setLength(0);
554 if( !aToken
.isEmpty() )
556 // restart with character
558 DBG_ASSERT( rInput
.Tell()-nStreamPos
==
559 static_cast<sal_uInt64
>(nPos
+1)*GetCharSize(),
560 "Wrong stream position" );
561 DBG_ASSERT( nlLinePos
-nLinePos
==
562 static_cast<sal_uInt32
>(nPos
+1),
563 "Wrong line position" );
564 rInput
.Seek( nStreamPos
);
565 nlLinePos
= nLinePos
;
566 ClearTxtConvContext();
567 return HtmlTokenId::TEXTTOKEN
;
570 // Hack: _GetNextChar shall not read the
573 aToken
.append( " " );
575 return HtmlTokenId::NONBREAKSPACE
;
577 return HtmlTokenId::SOFTHYPH
;
584 // &{...};-JavaScript-Macros are not supported any longer.
585 else if( IsParserWorking() )
587 sTmpBuffer
.append( '&' );
592 bNextCh
= (';' == nNextCh
);
593 if( cBreak
=='>' && (cChar
=='\\' || cChar
=='\'' ||
594 cChar
=='\"' || cChar
==' ') )
596 // ' and " have to be escaped within tags to separate
597 // them from ' and " enclosing options.
598 // \ has to be escaped as well.
599 // Space is protected because it's not a delimiter between
601 sTmpBuffer
.append( '\\' );
603 if( IsParserWorking() )
606 sTmpBuffer
.appendUtf32( cChar
);
608 else if( SvParserState::Pending
==eState
&& '>'!=cBreak
)
610 // Restart with '&', the remainder is returned as
612 if( !aToken
.isEmpty() || !sTmpBuffer
.isEmpty() )
614 // _GetNextChar() returns the previous text and
615 // during the next execution a new character is read.
616 // Thus we have to position in front of the '&'.
618 rInput
.Seek( nStreamPos
- GetCharSize() );
619 nlLinePos
= nLinePos
-1;
620 ClearTxtConvContext();
621 bReadNextChar
= true;
628 if( '>'==cBreak
&& !cQuote
)
630 sTmpBuffer
.appendUtf32( nNextCh
);
637 sTmpBuffer
.append( '\\' );
639 sTmpBuffer
.append( '\\' );
648 else if( cQuote
&& (cQuote
==nNextCh
) )
651 sTmpBuffer
.appendUtf32( nNextCh
);
652 bEqSignFound
= false;
655 case sal_Unicode(EOF
):
660 // else: ignore, not a valid code point
664 bEqSignFound
= false;
666 sTmpBuffer
.appendUtf32( nNextCh
);
668 bContinue
= false; // break, string is together
674 // If scanning options treat it like a space, ...
675 sTmpBuffer
.append( ' ' );
679 // otherwise it's a separate token.
688 // cr/lf in tag is handled in GetNextToken_()
689 sTmpBuffer
.appendUtf32( nNextCh
);
692 else if( bReadListing
|| bReadXMP
|| bReadPRE
|| bReadTextArea
)
697 // Reduce sequence of CR/LF/BLANK/TAB to a single blank
700 if( '\t'==nNextCh
&& bReadPRE
&& '>'!=cBreak
)
702 // Pass Tabs up in <PRE>
708 if( '\x0b'==nNextCh
&& (bReadPRE
|| bReadXMP
||bReadListing
) &&
713 if (!m_bPreserveSpaces
)
717 if (!m_bPreserveSpaces
)
719 sTmpBuffer
.appendUtf32(nNextCh
);
720 if ('>' != cBreak
&& (!bReadListing
&& !bReadXMP
&& !bReadPRE
&& !bReadTextArea
))
722 // Reduce sequences of Blanks/Tabs/CR/LF to a single blank
725 nNextCh
= GetNextChar();
726 if (sal_Unicode(EOF
) == nNextCh
&& rInput
.eof())
728 if (!aToken
.isEmpty() || sTmpBuffer
.getLength() > 1)
730 // Have seen s.th. aside from blanks?
731 aToken
.append(sTmpBuffer
);
732 sTmpBuffer
.setLength(0);
733 return HtmlTokenId::TEXTTOKEN
;
736 // Only read blanks: no text must be returned
737 // and GetNextToken_ has to read until EOF
738 return HtmlTokenId::NONE
;
740 } while (HTML_ISSPACE(nNextCh
));
747 bEqSignFound
= false;
748 if (nNextCh
== cBreak
&& !cQuote
)
753 if (!linguistic::IsControlChar(nNextCh
) || HTML_ISSPACE(nNextCh
))
755 // All remaining characters make their way into the text.
756 sTmpBuffer
.appendUtf32( nNextCh
);
759 nNextCh
= GetNextChar();
760 if( ( sal_Unicode(EOF
) == nNextCh
&& rInput
.eof() ) ||
763 if( !sTmpBuffer
.isEmpty() )
764 aToken
.append( sTmpBuffer
);
765 return HtmlTokenId::TEXTTOKEN
;
767 } while( rtl::isAsciiAlpha( nNextCh
) || rtl::isAsciiDigit( nNextCh
) );
772 if( bContinue
&& bNextCh
)
773 nNextCh
= GetNextChar();
776 if( !sTmpBuffer
.isEmpty() )
777 aToken
.append( sTmpBuffer
);
779 return HtmlTokenId::TEXTTOKEN
;
// Reads raw, unparsed data. GetNextToken_() calls this while bReadScript
// or bReadStyle is set or aEndToken is not empty.
//
// The default return value is HtmlTokenId::RAWDATA, with the data that
// was read in aToken. After a '<' the following "/", "!" and letters or
// '-' are collected and compared in lower case: against the script tag
// name or aEndToken, and for style sheets against the style, head and
// body tag names. When the end is found, the stream is moved back to the
// '<' so that the end tag is parsed as a normal token; if text has been
// collected before it, bEndTokenFound postpones that to the next call.
// HtmlTokenId::NONE is returned when nothing is to be reported and
// GetNextToken_() should continue reading.
782 HtmlTokenId
HTMLParser::GetNextRawToken()
784 OUStringBuffer
sTmpBuffer( MAX_LEN
);
788 // During the last execution we already found the end token,
789 // thus we don't have to search it again.
793 bEndTokenFound
= false;
795 return HtmlTokenId::NONE
;
798 // Default return value: HtmlTokenId::RAWDATA
799 bool bContinue
= true;
800 HtmlTokenId nToken
= HtmlTokenId::RAWDATA
;
801 SaveState( HtmlTokenId::NONE
);
802 while( bContinue
&& IsParserWorking() )
809 // Maybe we've reached the end.
811 // Save what we have read previously...
812 aToken
.append( sTmpBuffer
);
813 sTmpBuffer
.setLength(0);
815 // and remember position in stream.
816 sal_uInt64 nStreamPos
= rInput
.Tell();
817 sal_uInt32 nLineNr
= GetLineNr();
818 sal_uInt32 nLinePos
= GetLinePos();
820 // Start of an end token?
821 bool bOffState
= false;
822 if( '/' == (nNextCh
= GetNextChar()) )
825 nNextCh
= GetNextChar();
827 else if( '!' == nNextCh
)
829 sTmpBuffer
.appendUtf32( nNextCh
);
830 nNextCh
= GetNextChar();
833 // Read following letters
834 while( (rtl::isAsciiAlpha(nNextCh
) || '-'==nNextCh
) &&
835 IsParserWorking() && sTmpBuffer
.getLength() < MAX_LEN
)
837 sTmpBuffer
.appendUtf32( nNextCh
);
838 nNextCh
= GetNextChar();
// Tag names are compared case-insensitively, hence the lower-casing.
841 OUString
aTok( sTmpBuffer
.toString() );
842 aTok
= aTok
.toAsciiLowerCase();
844 if( bReadScript
|| !aEndToken
.isEmpty() )
848 if( aTok
.startsWith( OOO_STRING_SVTOOLS_HTML_comment
) )
854 // A script has to end with "</SCRIPT>". But
855 // ">" is optional for security reasons
858 ? aTok
== OOO_STRING_SVTOOLS_HTML_script
859 : aTok
== aEndToken
);
862 if( bReadComment
&& '>'==nNextCh
&& aTok
.endsWith( "--" ) )
864 // End of comment of style <!----->
865 bReadComment
= false;
870 // Style sheets can be closed by </STYLE>, </HEAD> or <BODY>
872 bDone
= aTok
== OOO_STRING_SVTOOLS_HTML_style
||
873 aTok
== OOO_STRING_SVTOOLS_HTML_head
;
875 bDone
= aTok
== OOO_STRING_SVTOOLS_HTML_body
;
880 // Done! Return the previously read string (if requested)
885 // nToken==0 means, GetNextToken_ continues to read
886 if( aToken
.isEmpty() && (bReadStyle
|| bReadScript
) )
888 // Immediately close environment (or context?)
889 // and parse the end token
893 nToken
= HtmlTokenId::NONE
;
897 // Keep bReadScript/bReadStyle alive
898 // and parse end token during next execution
899 bEndTokenFound
= true;
902 // Move backwards in stream to '<'
903 rInput
.Seek( nStreamPos
);
904 SetLineNr( nLineNr
);
905 SetLinePos( nLinePos
);
906 ClearTxtConvContext();
909 // Don't append string to token.
910 sTmpBuffer
.setLength( 0 );
914 // remember "</" , everything else we find in the buffer
915 aToken
.append( "<" );
917 aToken
.append( "/" );
924 sTmpBuffer
.appendUtf32( nNextCh
);
927 bool bTwoMinus
= false;
928 nNextCh
= GetNextChar();
929 while( '-' == nNextCh
&& IsParserWorking() )
932 sTmpBuffer
.appendUtf32( nNextCh
);
933 nNextCh
= GetNextChar();
936 if( '>' == nNextCh
&& IsParserWorking() && bTwoMinus
)
937 bReadComment
= false;
944 // \r\n? closes the current text token (even if it's empty)
945 nNextCh
= GetNextChar();
947 nNextCh
= GetNextChar();
951 // \n closes the current text token (even if it's empty)
952 nNextCh
= GetNextChar();
955 case sal_Unicode(EOF
):
956 // eof closes the current text token and behaves like having read
961 if( !aToken
.isEmpty() || !sTmpBuffer
.isEmpty() )
963 bEndTokenFound
= true;
970 nToken
= HtmlTokenId::NONE
;
975 if (!linguistic::IsControlChar(nNextCh
) || nNextCh
== '\t')
977 // all remaining characters are appended to the buffer
978 sTmpBuffer
.appendUtf32( nNextCh
);
983 if( !bContinue
&& !sTmpBuffer
.isEmpty() )
985 aToken
.append( sTmpBuffer
);
986 sTmpBuffer
.setLength(0);
989 if( bContinue
&& bNextCh
)
990 nNextCh
= GetNextChar();
993 if( IsParserWorking() )
994 SaveState( HtmlTokenId::NONE
);
996 nToken
= HtmlTokenId::NONE
;
// Reads and returns the next token from the input stream.
//
// Order of processing, as far as visible here:
//  - A pending OFF token (generated for a tag closed with "/>") is
//    returned first, with an empty aToken.
//  - While reading a script or style, or while aEndToken is set, the data
//    is read with GetNextRawToken().
//  - After '<' the tag name is collected, lower-cased, stripped of the
//    namespace prefix (maNamespace) and looked up with GetHTMLToken();
//    unknown names become UNKNOWNCONTROL_ON / UNKNOWNCONTROL_OFF.
//    Comments are read up to "-->", CDATA sections up to "]]>", and
//    "<% ... %>" blocks up to "%>".
//  - Everything else is read as text.
// The loop repeats as long as no token was produced and the parser is in
// state Working. If the parser ends up in state Pending,
// HtmlTokenId::INVALID is returned.
1002 HtmlTokenId
HTMLParser::GetNextToken_()
1004 HtmlTokenId nRet
= HtmlTokenId::NONE
;
1007 if (mnPendingOffToken
!= HtmlTokenId::NONE
)
1009 // HtmlTokenId::<TOKEN>_OFF generated for HtmlTokenId::<TOKEN>_ON
1010 nRet
= mnPendingOffToken
;
1011 mnPendingOffToken
= HtmlTokenId::NONE
;
1012 aToken
.setLength( 0 );
1019 if( !IsParserWorking() ) // Don't continue if already an error occurred
1020 return HtmlTokenId::NONE
;
// Saved so that bReadNextChar can be restored if the parser goes into
// state Pending while reading.
1022 bool bReadNextCharSave
= bReadNextChar
;
1025 DBG_ASSERT( !bEndTokenFound
,
1026 "Read a character despite </SCRIPT> was read?" );
1027 nNextCh
= GetNextChar();
1028 if( !IsParserWorking() ) // Don't continue if already an error occurred
1029 return HtmlTokenId::NONE
;
1030 bReadNextChar
= false;
1033 if( bReadScript
|| bReadStyle
|| !aEndToken
.isEmpty() )
1035 nRet
= GetNextRawToken();
1036 if( nRet
!= HtmlTokenId::NONE
|| !IsParserWorking() )
1041 bool bNextCh
= true;
// Remember the current position; it is restored further down when EOF is
// reached before the tag is complete.
1046 sal_uInt64 nStreamPos
= rInput
.Tell();
1047 sal_uInt32 nLineNr
= GetLineNr();
1048 sal_uInt32 nLinePos
= GetLinePos();
1050 bool bOffState
= false;
1051 if( '/' == (nNextCh
= GetNextChar()) )
1054 nNextCh
= GetNextChar();
1056 // Assume '<?' is a start of an XML declaration, ignore it.
1057 if (rtl::isAsciiAlpha(nNextCh
) || nNextCh
== '!' || nNextCh
== '?')
1059 OUStringBuffer sTmpBuffer
;
1061 sTmpBuffer
.appendUtf32( nNextCh
);
1062 nNextCh
= GetNextChar();
1063 if (std::u16string_view(sTmpBuffer
) == u
"![CDATA[")
// When fuzzing, give up on over-long tag names: the parser is put into the
// error state.
1065 if (bFuzzing
&& sTmpBuffer
.getLength() > 1024)
1067 SAL_WARN("svtools", "abandoning import for performance reasons with long tokens");
1068 eState
= SvParserState::Error
;
1071 } while( '>' != nNextCh
&& '/' != nNextCh
&& !rtl::isAsciiWhiteSpace( nNextCh
) &&
1072 !linguistic::IsControlChar(nNextCh
) &&
1073 IsParserWorking() && !rInput
.eof() );
1075 if( !sTmpBuffer
.isEmpty() )
1077 aToken
.append( sTmpBuffer
);
1078 sTmpBuffer
.setLength(0);
1082 while( rtl::isAsciiWhiteSpace( nNextCh
) && IsParserWorking() )
1083 nNextCh
= GetNextChar();
1085 if( !IsParserWorking() )
1087 if( SvParserState::Pending
== eState
)
1088 bReadNextChar
= bReadNextCharSave
;
1092 // Search token in table:
1093 sSaveToken
= aToken
;
1094 aToken
= aToken
.toString().toAsciiLowerCase();
1096 if (!maNamespace
.isEmpty() && o3tl::starts_with(aToken
, maNamespace
))
1097 aToken
.remove( 0, maNamespace
.getLength());
1099 if( HtmlTokenId::NONE
== (nRet
= GetHTMLToken( aToken
)) )
1101 nRet
= HtmlTokenId::UNKNOWNCONTROL_ON
;
1103 // If it's a token which can be switched off...
1106 if( nRet
>= HtmlTokenId::ONOFF_START
)
1108 // and there is an off token, return off token instead
1109 nRet
= static_cast<HtmlTokenId
>(static_cast<int>(nRet
) + 1);
1111 else if( HtmlTokenId::LINEBREAK
!=nRet
|| !maNamespace
.isEmpty())
1113 // and there is no off token, return unknown token.
1114 // (except for </BR>, that is treated like <BR>)
1115 // No exception for XHTML, though.
1116 nRet
= HtmlTokenId::UNKNOWNCONTROL_OFF
;
1120 if( nRet
== HtmlTokenId::COMMENT
)
1122 // fix: due to being case sensitive use sSaveToken as start of comment
1123 // and append a blank.
1124 aToken
= sSaveToken
;
1126 aToken
.append( " " );
1127 sal_uInt64 nCStreamPos
= 0;
1128 sal_uInt32 nCLineNr
= 0;
1129 sal_uInt32 nCLinePos
= 0;
1130 sal_Int32 nCStrLen
= 0;
1133 // Read until closing -->. If not found restart at first >
1134 sTmpBuffer
= aToken
;
1135 while( !bDone
&& !rInput
.eof() && IsParserWorking() )
1141 nCStreamPos
= rInput
.Tell();
1142 nCStrLen
= sTmpBuffer
.getLength();
1143 nCLineNr
= GetLineNr();
1144 nCLinePos
= GetLinePos();
1146 bDone
= sTmpBuffer
.getLength() >= 2 && sTmpBuffer
[sTmpBuffer
.getLength() - 2] == '-' && sTmpBuffer
[sTmpBuffer
.getLength() - 1] == '-';
1148 sTmpBuffer
.appendUtf32(nNextCh
);
1150 else if (!linguistic::IsControlChar(nNextCh
)
1151 || nNextCh
== '\r' || nNextCh
== '\n' || nNextCh
== '\t')
1153 sTmpBuffer
.appendUtf32(nNextCh
);
1156 nNextCh
= GetNextChar();
1158 aToken
= sTmpBuffer
;
1159 sTmpBuffer
.setLength(0);
1160 if( !bDone
&& IsParserWorking() && nCStreamPos
)
1162 rInput
.Seek( nCStreamPos
);
1163 SetLineNr( nCLineNr
);
1164 SetLinePos( nCLinePos
);
1165 ClearTxtConvContext();
1166 aToken
.truncate(nCStrLen
);
1170 else if (nRet
== HtmlTokenId::CDATA
)
1172 // Read until the closing ]]>.
1174 while (!bDone
&& !rInput
.eof() && IsParserWorking())
1178 if (sTmpBuffer
.getLength() >= 2)
1180 bDone
= sTmpBuffer
[sTmpBuffer
.getLength() - 2] == ']'
1181 && sTmpBuffer
[sTmpBuffer
.getLength() - 1] == ']';
1184 // Ignore ]] at the end.
1185 sTmpBuffer
.setLength(sTmpBuffer
.getLength() - 2);
1190 sTmpBuffer
.appendUtf32(nNextCh
);
1193 else if (!linguistic::IsControlChar(nNextCh
))
1195 sTmpBuffer
.appendUtf32(nNextCh
);
1199 nNextCh
= GetNextChar();
1202 aToken
= sTmpBuffer
;
1203 sTmpBuffer
.setLength(0);
1207 // TokenString not needed anymore
1208 aToken
.setLength( 0 );
1211 // Read until closing '>'
1212 if( '>' != nNextCh
&& IsParserWorking() )
1216 // fdo#34666 fdo#36080 fdo#36390: closing "/>"?:
1217 // generate pending HtmlTokenId::<TOKEN>_OFF for HtmlTokenId::<TOKEN>_ON
1218 // Do not convert this to a single HtmlTokenId::<TOKEN>_OFF
1219 // which lead to fdo#56772.
1220 if ((nRet
>= HtmlTokenId::ONOFF_START
) && o3tl::ends_with(aToken
, u
"/"))
1222 mnPendingOffToken
= static_cast<HtmlTokenId
>(static_cast<int>(nRet
) + 1); // HtmlTokenId::<TOKEN>_ON -> HtmlTokenId::<TOKEN>_OFF
1223 aToken
.setLength( aToken
.getLength()-1 ); // remove trailing '/'
1225 if( sal_Unicode(EOF
) == nNextCh
&& rInput
.eof() )
1227 // Move back in front of < and restart there.
1228 // Return < as text.
1229 rInput
.Seek( nStreamPos
);
1230 SetLineNr( nLineNr
);
1231 SetLinePos( nLinePos
);
1232 ClearTxtConvContext();
1235 nRet
= HtmlTokenId::TEXTTOKEN
;
1236 nNextCh
= GetNextChar();
1241 if( SvParserState::Pending
== eState
)
1242 bReadNextChar
= bReadNextCharSave
;
1248 // simply throw away everything
1250 if( sal_Unicode(EOF
) == nNextCh
&& rInput
.eof() )
1252 // Move back in front of < and restart there.
1253 // Return < as text.
1254 rInput
.Seek( nStreamPos
);
1255 SetLineNr( nLineNr
);
1256 SetLinePos( nLinePos
);
1257 ClearTxtConvContext();
1260 nRet
= HtmlTokenId::TEXTTOKEN
;
1261 nNextCh
= GetNextChar();
1265 if( SvParserState::Pending
== eState
)
1266 bReadNextChar
= bReadNextCharSave
;
1267 aToken
.setLength( 0 );
1269 else if( '%' == nNextCh
)
1271 nRet
= HtmlTokenId::UNKNOWNCONTROL_ON
;
1273 sal_uInt64 nCStreamPos
= rInput
.Tell();
1274 sal_uInt32 nCLineNr
= GetLineNr(), nCLinePos
= GetLinePos();
1277 // Read until closing %>. If not found restart at first >.
1278 sal_Unicode nLastTokenChar
= !aToken
.isEmpty() ? aToken
[aToken
.getLength() - 1] : 0;
1279 OUStringBuffer
aTmpBuffer(aToken
);
1280 while( !bDone
&& !rInput
.eof() && IsParserWorking() )
1282 bDone
= '>'==nNextCh
&& nLastTokenChar
== '%';
1285 aTmpBuffer
.appendUtf32(nNextCh
);
1286 nLastTokenChar
= aTmpBuffer
[aTmpBuffer
.getLength() - 1];
1287 nNextCh
= GetNextChar();
1290 if( !bDone
&& IsParserWorking() )
1292 rInput
.Seek( nCStreamPos
);
1293 SetLineNr( nCLineNr
);
1294 SetLinePos( nCLinePos
);
1295 ClearTxtConvContext();
1297 nRet
= HtmlTokenId::TEXTTOKEN
;
1300 aToken
= aTmpBuffer
;
1301 aTmpBuffer
.setLength(0);
1302 if( IsParserWorking() )
1304 sSaveToken
= aToken
;
1305 aToken
.setLength( 0 );
1311 nRet
= HtmlTokenId::TEXTTOKEN
;
1317 if( IsParserWorking() )
1319 bNextCh
= '>' == nNextCh
;
1322 case HtmlTokenId::TEXTAREA_ON
:
1323 bReadTextArea
= true;
1325 case HtmlTokenId::TEXTAREA_OFF
:
1326 bReadTextArea
= false;
1328 case HtmlTokenId::SCRIPT_ON
:
1329 if( !bReadTextArea
)
1332 case HtmlTokenId::SCRIPT_OFF
:
1333 if( !bReadTextArea
)
1335 bReadScript
= false;
1336 // JavaScript might modify the stream,
1337 // thus the last character has to be read again.
1338 bReadNextChar
= true;
1343 case HtmlTokenId::STYLE_ON
:
1346 case HtmlTokenId::STYLE_OFF
:
1355 case sal_Unicode(EOF
):
1358 eState
= SvParserState::Accepted
;
1359 nRet
= HtmlTokenId(nNextCh
);
1363 // Read normal text.
1369 // form feeds are passed upwards separately
1370 nRet
= HtmlTokenId::LINEFEEDCHAR
; // !!! should be FORMFEEDCHAR
1375 if( bReadListing
|| bReadXMP
|| bReadPRE
|| bReadTextArea
)
1377 sal_Unicode c
= GetNextChar();
1378 if( ( '\n' != nNextCh
|| '\r' != c
) &&
1379 ( '\r' != nNextCh
|| '\n' != c
) )
1384 nRet
= HtmlTokenId::NEWPARA
;
1391 nRet
= HtmlTokenId::TABCHAR
;
1400 // "normal" text to come
1402 bNextCh
= 0 == aToken
.getLength();
1404 // the text should be processed
1405 if( !bNextCh
&& eState
== SvParserState::Pending
)
1407 eState
= SvParserState::Working
;
1408 bReadNextChar
= true;
1414 if( bNextCh
&& SvParserState::Working
== eState
)
1416 nNextCh
= GetNextChar();
1417 if( SvParserState::Pending
== eState
&& nRet
!= HtmlTokenId::NONE
&& HtmlTokenId::TEXTTOKEN
!= nRet
)
1419 bReadNextChar
= true;
1420 eState
= SvParserState::Working
;
1424 } while( nRet
== HtmlTokenId::NONE
&& SvParserState::Working
== eState
);
1426 if( SvParserState::Pending
== eState
)
1427 nRet
= HtmlTokenId::INVALID
; // s.th. invalid
// Removes the escaping backslashes from aToken in place: a backslash that
// is not itself preceded by an escaping backslash is deleted from the
// token.
// NOTE(review): the lines that update bEscape and advance nPos are not
// visible in this excerpt.
1432 void HTMLParser::UnescapeToken()
1436 bool bEscape
= false;
1437 while( nPos
< aToken
.getLength() )
1439 bool bOldEscape
= bEscape
;
1441 if( '\\'==aToken
[nPos
] && !bOldEscape
)
1443 aToken
.remove( nPos
, 1 );
1453 const HTMLOptions
& HTMLParser::GetOptions( HtmlOptionId
const *pNoConvertToken
)
1455 // If the options for the current token have already been returned,
1456 // return them once again.
1457 if (!maOptions
.empty())
1461 while( nPos
< aToken
.getLength() )
1463 // A letter? Option beginning here.
1464 if( rtl::isAsciiAlpha( aToken
[nPos
] ) )
1466 HtmlOptionId nToken
;
1468 sal_Int32 nStt
= nPos
;
1469 sal_Unicode cChar
= 0;
1471 // Actually only certain characters allowed.
1472 // Netscape only looks for "=" and white space (c.f.
1473 // Mozilla: PA_FetchRequestedNameValues in libparse/pa_mdl.c)
1474 while( nPos
< aToken
.getLength() )
1476 cChar
= aToken
[nPos
];
1477 if ( '=' == cChar
||!HTML_ISPRINTABLE(cChar
) || rtl::isAsciiWhiteSpace(cChar
) )
1482 OUString
sName( aToken
.subView( nStt
, nPos
-nStt
) );
1484 // PlugIns require original token name. Convert to lower case only for searching.
1485 nToken
= GetHTMLOption( sName
.toAsciiLowerCase() ); // Name is ready
1486 SAL_WARN_IF( nToken
==HtmlOptionId::UNKNOWN
, "svtools",
1487 "GetOption: unknown HTML option '" << sName
<< "'" );
1488 bool bStripCRLF
= (nToken
< HtmlOptionId::SCRIPT_START
||
1489 nToken
>= HtmlOptionId::SCRIPT_END
) &&
1490 (!pNoConvertToken
|| nToken
!= *pNoConvertToken
);
1492 while( nPos
< aToken
.getLength() )
1494 cChar
= aToken
[nPos
];
1495 if ( HTML_ISPRINTABLE(cChar
) && !rtl::isAsciiWhiteSpace(cChar
) )
1500 // Option with value?
1501 if( nPos
!=aToken
.getLength() && '='==cChar
)
1505 while( nPos
< aToken
.getLength() )
1507 cChar
= aToken
[nPos
];
1508 if ( HTML_ISPRINTABLE(cChar
) && ' ' != cChar
&& '\t' != cChar
&& '\r' != cChar
&& '\n' != cChar
)
1513 if( nPos
!= aToken
.getLength() )
1517 if( ('"'==cChar
) || '\''==cChar
)
1519 sal_Unicode cEnd
= cChar
;
1522 bool bEscape
= false;
1523 while( nPos
< aToken
.getLength() && !bDone
)
1525 bool bOldEscape
= bEscape
;
1527 cChar
= aToken
[nPos
];
1533 aToken
.remove( nPos
, 1 );
1548 aToken
.remove( nPos
, 1 );
1554 bDone
= !bOldEscape
&& cChar
==cEnd
;
1567 if( nPos
!=aToken
.getLength() )
1572 // More liberal than the standard: allow all printable characters
1573 bool bEscape
= false;
1575 while( nPos
< aToken
.getLength() && !bDone
)
1577 bool bOldEscape
= bEscape
;
1579 sal_Unicode c
= aToken
[nPos
];
1583 bDone
= !bOldEscape
;
1605 aToken
.remove( nPos
, 1 );
1611 if( HTML_ISPRINTABLE( c
) )
1624 aValue
= aToken
.subView( nStt
, nLen
);
1628 // Token is known and can be saved
1629 maOptions
.emplace_back(nToken
, sName
, aValue
);
1633 // Ignore white space and unexpected characters
1640 HtmlTokenId
HTMLParser::FilterPRE( HtmlTokenId nToken
)
1644 // in Netscape they only have impact in not empty paragraphs
1645 case HtmlTokenId::PARABREAK_ON
:
1646 nToken
= HtmlTokenId::LINEBREAK
;
1648 case HtmlTokenId::LINEBREAK
:
1649 case HtmlTokenId::NEWPARA
:
1651 if( bPre_IgnoreNewPara
)
1652 nToken
= HtmlTokenId::NONE
;
1655 case HtmlTokenId::TABCHAR
:
1657 sal_Int32 nSpaces
= 8 - (nPre_LinePos
% 8);
1658 DBG_ASSERT( aToken
.isEmpty(), "Why is the token not empty?" );
1659 if (aToken
.getLength() < nSpaces
)
1661 using comphelper::string::padToLength
;
1662 OUStringBuffer
aBuf(aToken
);
1663 aToken
= padToLength(aBuf
, nSpaces
, ' ');
1665 nPre_LinePos
+= nSpaces
;
1666 nToken
= HtmlTokenId::TEXTTOKEN
;
1670 case HtmlTokenId::TEXTTOKEN
:
1671 nPre_LinePos
+= aToken
.getLength();
1674 case HtmlTokenId::SELECT_ON
:
1675 case HtmlTokenId::SELECT_OFF
:
1676 case HtmlTokenId::BODY_ON
:
1677 case HtmlTokenId::FORM_ON
:
1678 case HtmlTokenId::FORM_OFF
:
1679 case HtmlTokenId::INPUT
:
1680 case HtmlTokenId::OPTION
:
1681 case HtmlTokenId::TEXTAREA_ON
:
1682 case HtmlTokenId::TEXTAREA_OFF
:
1684 case HtmlTokenId::IMAGE
:
1685 case HtmlTokenId::APPLET_ON
:
1686 case HtmlTokenId::APPLET_OFF
:
1687 case HtmlTokenId::PARAM
:
1688 case HtmlTokenId::EMBED
:
1690 case HtmlTokenId::HEAD1_ON
:
1691 case HtmlTokenId::HEAD1_OFF
:
1692 case HtmlTokenId::HEAD2_ON
:
1693 case HtmlTokenId::HEAD2_OFF
:
1694 case HtmlTokenId::HEAD3_ON
:
1695 case HtmlTokenId::HEAD3_OFF
:
1696 case HtmlTokenId::HEAD4_ON
:
1697 case HtmlTokenId::HEAD4_OFF
:
1698 case HtmlTokenId::HEAD5_ON
:
1699 case HtmlTokenId::HEAD5_OFF
:
1700 case HtmlTokenId::HEAD6_ON
:
1701 case HtmlTokenId::HEAD6_OFF
:
1702 case HtmlTokenId::BLOCKQUOTE_ON
:
1703 case HtmlTokenId::BLOCKQUOTE_OFF
:
1704 case HtmlTokenId::ADDRESS_ON
:
1705 case HtmlTokenId::ADDRESS_OFF
:
1706 case HtmlTokenId::HORZRULE
:
1708 case HtmlTokenId::CENTER_ON
:
1709 case HtmlTokenId::CENTER_OFF
:
1710 case HtmlTokenId::DIVISION_ON
:
1711 case HtmlTokenId::DIVISION_OFF
:
1713 case HtmlTokenId::SCRIPT_ON
:
1714 case HtmlTokenId::SCRIPT_OFF
:
1715 case HtmlTokenId::RAWDATA
:
1717 case HtmlTokenId::TABLE_ON
:
1718 case HtmlTokenId::TABLE_OFF
:
1719 case HtmlTokenId::CAPTION_ON
:
1720 case HtmlTokenId::CAPTION_OFF
:
1721 case HtmlTokenId::COLGROUP_ON
:
1722 case HtmlTokenId::COLGROUP_OFF
:
1723 case HtmlTokenId::COL_ON
:
1724 case HtmlTokenId::COL_OFF
:
1725 case HtmlTokenId::THEAD_ON
:
1726 case HtmlTokenId::THEAD_OFF
:
1727 case HtmlTokenId::TFOOT_ON
:
1728 case HtmlTokenId::TFOOT_OFF
:
1729 case HtmlTokenId::TBODY_ON
:
1730 case HtmlTokenId::TBODY_OFF
:
1731 case HtmlTokenId::TABLEROW_ON
:
1732 case HtmlTokenId::TABLEROW_OFF
:
1733 case HtmlTokenId::TABLEDATA_ON
:
1734 case HtmlTokenId::TABLEDATA_OFF
:
1735 case HtmlTokenId::TABLEHEADER_ON
:
1736 case HtmlTokenId::TABLEHEADER_OFF
:
1738 case HtmlTokenId::ANCHOR_ON
:
1739 case HtmlTokenId::ANCHOR_OFF
:
1740 case HtmlTokenId::BOLD_ON
:
1741 case HtmlTokenId::BOLD_OFF
:
1742 case HtmlTokenId::ITALIC_ON
:
1743 case HtmlTokenId::ITALIC_OFF
:
1744 case HtmlTokenId::STRIKE_ON
:
1745 case HtmlTokenId::STRIKE_OFF
:
1746 case HtmlTokenId::STRIKETHROUGH_ON
:
1747 case HtmlTokenId::STRIKETHROUGH_OFF
:
1748 case HtmlTokenId::UNDERLINE_ON
:
1749 case HtmlTokenId::UNDERLINE_OFF
:
1750 case HtmlTokenId::BASEFONT_ON
:
1751 case HtmlTokenId::BASEFONT_OFF
:
1752 case HtmlTokenId::FONT_ON
:
1753 case HtmlTokenId::FONT_OFF
:
1754 case HtmlTokenId::BLINK_ON
:
1755 case HtmlTokenId::BLINK_OFF
:
1756 case HtmlTokenId::SPAN_ON
:
1757 case HtmlTokenId::SPAN_OFF
:
1758 case HtmlTokenId::SUBSCRIPT_ON
:
1759 case HtmlTokenId::SUBSCRIPT_OFF
:
1760 case HtmlTokenId::SUPERSCRIPT_ON
:
1761 case HtmlTokenId::SUPERSCRIPT_OFF
:
1762 case HtmlTokenId::BIGPRINT_ON
:
1763 case HtmlTokenId::BIGPRINT_OFF
:
1764 case HtmlTokenId::SMALLPRINT_OFF
:
1765 case HtmlTokenId::SMALLPRINT_ON
:
1767 case HtmlTokenId::EMPHASIS_ON
:
1768 case HtmlTokenId::EMPHASIS_OFF
:
1769 case HtmlTokenId::CITATION_ON
:
1770 case HtmlTokenId::CITATION_OFF
:
1771 case HtmlTokenId::STRONG_ON
:
1772 case HtmlTokenId::STRONG_OFF
:
1773 case HtmlTokenId::CODE_ON
:
1774 case HtmlTokenId::CODE_OFF
:
1775 case HtmlTokenId::SAMPLE_ON
:
1776 case HtmlTokenId::SAMPLE_OFF
:
1777 case HtmlTokenId::KEYBOARD_ON
:
1778 case HtmlTokenId::KEYBOARD_OFF
:
1779 case HtmlTokenId::VARIABLE_ON
:
1780 case HtmlTokenId::VARIABLE_OFF
:
1781 case HtmlTokenId::DEFINSTANCE_ON
:
1782 case HtmlTokenId::DEFINSTANCE_OFF
:
1783 case HtmlTokenId::SHORTQUOTE_ON
:
1784 case HtmlTokenId::SHORTQUOTE_OFF
:
1785 case HtmlTokenId::LANGUAGE_ON
:
1786 case HtmlTokenId::LANGUAGE_OFF
:
1787 case HtmlTokenId::AUTHOR_ON
:
1788 case HtmlTokenId::AUTHOR_OFF
:
1789 case HtmlTokenId::PERSON_ON
:
1790 case HtmlTokenId::PERSON_OFF
:
1791 case HtmlTokenId::ACRONYM_ON
:
1792 case HtmlTokenId::ACRONYM_OFF
:
1793 case HtmlTokenId::ABBREVIATION_ON
:
1794 case HtmlTokenId::ABBREVIATION_OFF
:
1795 case HtmlTokenId::INSERTEDTEXT_ON
:
1796 case HtmlTokenId::INSERTEDTEXT_OFF
:
1797 case HtmlTokenId::DELETEDTEXT_ON
:
1798 case HtmlTokenId::DELETEDTEXT_OFF
:
1799 case HtmlTokenId::TELETYPE_ON
:
1800 case HtmlTokenId::TELETYPE_OFF
:
1804 // The remainder is treated as an unknown token.
1806 if( nToken
!= HtmlTokenId::NONE
)
1809 ( ((nToken
>= HtmlTokenId::ONOFF_START
) && isOffToken(nToken
))
1810 ? HtmlTokenId::UNKNOWNCONTROL_OFF
1811 : HtmlTokenId::UNKNOWNCONTROL_ON
);
1816 bPre_IgnoreNewPara
= false;
1821 HtmlTokenId
HTMLParser::FilterXMP( HtmlTokenId nToken
)
1825 case HtmlTokenId::NEWPARA
:
1826 if( bPre_IgnoreNewPara
)
1827 nToken
= HtmlTokenId::NONE
;
1829 case HtmlTokenId::TEXTTOKEN
:
1830 case HtmlTokenId::NONBREAKSPACE
:
1831 case HtmlTokenId::SOFTHYPH
:
1835 if( nToken
!= HtmlTokenId::NONE
)
1837 if( (nToken
>= HtmlTokenId::ONOFF_START
) && isOffToken(nToken
) )
1839 sSaveToken
= "</" + sSaveToken
;
1842 sSaveToken
= "<" + sSaveToken
;
1843 if( !aToken
.isEmpty() )
1847 aToken
.insert(0, sSaveToken
);
1850 aToken
= sSaveToken
;
1851 aToken
.append( ">" );
1852 nToken
= HtmlTokenId::TEXTTOKEN
;
1857 bPre_IgnoreNewPara
= false;
1862 HtmlTokenId
HTMLParser::FilterListing( HtmlTokenId nToken
)
1866 case HtmlTokenId::NEWPARA
:
1867 if( bPre_IgnoreNewPara
)
1868 nToken
= HtmlTokenId::NONE
;
1870 case HtmlTokenId::TEXTTOKEN
:
1871 case HtmlTokenId::NONBREAKSPACE
:
1872 case HtmlTokenId::SOFTHYPH
:
1876 if( nToken
!= HtmlTokenId::NONE
)
1879 ( ((nToken
>= HtmlTokenId::ONOFF_START
) && isOffToken(nToken
))
1880 ? HtmlTokenId::UNKNOWNCONTROL_OFF
1881 : HtmlTokenId::UNKNOWNCONTROL_ON
);
1886 bPre_IgnoreNewPara
= false;
1891 bool HTMLParser::InternalImgToPrivateURL( OUString
& rURL
)
1893 bool bFound
= false;
1895 if( rURL
.startsWith( OOO_STRING_SVTOOLS_HTML_internal_icon
) )
1897 OUString
aName( rURL
.copy(14) );
1901 bFound
= aName
== OOO_STRING_SVTOOLS_HTML_INT_ICON_baddata
;
1904 bFound
= aName
== OOO_STRING_SVTOOLS_HTML_INT_ICON_delayed
;
1907 bFound
= aName
== OOO_STRING_SVTOOLS_HTML_INT_ICON_embed
;
1910 bFound
= aName
== OOO_STRING_SVTOOLS_HTML_INT_ICON_insecure
;
1913 bFound
= aName
== OOO_STRING_SVTOOLS_HTML_INT_ICON_notfound
;
1919 OUString
sTmp ( rURL
);
1920 rURL
= OOO_STRING_SVTOOLS_HTML_private_image
;
1929 enum class HtmlMeta
{
1948 HTMLOptionEnum
<HtmlMeta
> const aHTMLMetaNameTable
[] =
1950 { OOO_STRING_SVTOOLS_HTML_META_author
, HtmlMeta::Author
},
1951 { OOO_STRING_SVTOOLS_HTML_META_changed
, HtmlMeta::Changed
},
1952 { OOO_STRING_SVTOOLS_HTML_META_changedby
, HtmlMeta::ChangedBy
},
1953 { OOO_STRING_SVTOOLS_HTML_META_classification
,HtmlMeta::Classification
},
1954 { OOO_STRING_SVTOOLS_HTML_META_content_type
, HtmlMeta::ContentType
},
1955 { OOO_STRING_SVTOOLS_HTML_META_created
, HtmlMeta::Created
},
1956 { OOO_STRING_SVTOOLS_HTML_META_description
, HtmlMeta::Description
},
1957 { OOO_STRING_SVTOOLS_HTML_META_keywords
, HtmlMeta::Keywords
},
1958 { OOO_STRING_SVTOOLS_HTML_META_generator
, HtmlMeta::Generator
},
1959 { OOO_STRING_SVTOOLS_HTML_META_refresh
, HtmlMeta::Refresh
},
1960 { OOO_STRING_SVTOOLS_HTML_META_sdendnote
, HtmlMeta::SDEndnote
},
1961 { OOO_STRING_SVTOOLS_HTML_META_sdfootnote
, HtmlMeta::SDFootnote
},
1962 { nullptr, HtmlMeta(0) }
1966 void HTMLParser::AddMetaUserDefined( OUString
const & )
1970 bool HTMLParser::ParseMetaOptionsImpl(
1971 const uno::Reference
<document::XDocumentProperties
> & i_xDocProps
,
1972 SvKeyValueIterator
*i_pHTTPHeader
,
1973 const HTMLOptions
& aOptions
,
1974 rtl_TextEncoding
& o_rEnc
)
1976 OUString aName
, aContent
;
1977 HtmlMeta nAction
= HtmlMeta::NONE
;
1978 bool bHTTPEquiv
= false, bChanged
= false;
1980 for ( size_t i
= aOptions
.size(); i
; )
1982 const HTMLOption
& aOption
= aOptions
[--i
];
1983 switch ( aOption
.GetToken() )
1985 case HtmlOptionId::NAME
:
1986 aName
= aOption
.GetString();
1987 if ( HtmlMeta::NONE
==nAction
)
1989 aOption
.GetEnum( nAction
, aHTMLMetaNameTable
);
1992 case HtmlOptionId::HTTPEQUIV
:
1993 aName
= aOption
.GetString();
1994 aOption
.GetEnum( nAction
, aHTMLMetaNameTable
);
1997 case HtmlOptionId::CONTENT
:
1998 aContent
= aOption
.GetString();
2000 case HtmlOptionId::CHARSET
:
2002 OString
sValue(OUStringToOString(aOption
.GetString(), RTL_TEXTENCODING_ASCII_US
));
2003 o_rEnc
= GetExtendedCompatibilityTextEncoding(rtl_getTextEncodingFromMimeCharset(sValue
.getStr()));
2010 if ( bHTTPEquiv
|| HtmlMeta::Description
!= nAction
)
2012 // if it is not a Description, remove CRs and LFs from CONTENT
2013 aContent
= aContent
.replaceAll("\r", "").replaceAll("\n", "");
2017 // convert line endings for Description
2018 aContent
= convertLineEnd(aContent
, GetSystemLineEnd());
2021 if ( bHTTPEquiv
&& i_pHTTPHeader
)
2023 // Netscape seems to just ignore a closing ", so we do too
2024 if ( aContent
.endsWith("\"") )
2026 aContent
= aContent
.copy( 0, aContent
.getLength() - 1 );
2028 SvKeyValue
aKeyValue( aName
, aContent
);
2029 i_pHTTPHeader
->Append( aKeyValue
);
2034 case HtmlMeta::Author
:
2035 if (i_xDocProps
.is()) {
2036 i_xDocProps
->setAuthor( aContent
);
2040 case HtmlMeta::Description
:
2041 if (i_xDocProps
.is()) {
2042 i_xDocProps
->setDescription( aContent
);
2046 case HtmlMeta::Keywords
:
2047 if (i_xDocProps
.is()) {
2048 i_xDocProps
->setKeywords(
2049 ::comphelper::string::convertCommaSeparated(aContent
));
2053 case HtmlMeta::Classification
:
2054 if (i_xDocProps
.is()) {
2055 i_xDocProps
->setSubject( aContent
);
2060 case HtmlMeta::ChangedBy
:
2061 if (i_xDocProps
.is()) {
2062 i_xDocProps
->setModifiedBy( aContent
);
2067 case HtmlMeta::Created
:
2068 case HtmlMeta::Changed
:
2069 if (i_xDocProps
.is() && !aContent
.isEmpty())
2071 ::util::DateTime uDT
;
2073 if (comphelper::string::getTokenCount(aContent
, ';') == 2)
2075 sal_Int32 nIdx
{ 0 };
2076 sal_Int32 nDate
= o3tl::toInt32(o3tl::getToken(aContent
, 0, ';', nIdx
));
2077 sal_Int64 nTime
= o3tl::toInt64(o3tl::getToken(aContent
, 0, ';', nIdx
));
2078 valid
= nDate
!= std::numeric_limits
<sal_Int32
>::min() &&
2079 nTime
!= std::numeric_limits
<sal_Int64
>::min();
2083 tools::Time
aTime(tools::Time::fromEncodedTime(nTime
));
2084 uDT
= DateTime(aDate
, aTime
).GetUNODateTime();
2087 else if (utl::ISO8601parseDateTime(aContent
, uDT
))
2093 if (HtmlMeta::Created
== nAction
)
2094 i_xDocProps
->setCreationDate(uDT
);
2096 i_xDocProps
->setModificationDate(uDT
);
2101 case HtmlMeta::Refresh
:
2102 DBG_ASSERT( !bHTTPEquiv
|| i_pHTTPHeader
, "Lost Reload-URL because of omitted MUST change." );
2105 case HtmlMeta::ContentType
:
2106 if ( !aContent
.isEmpty() )
2108 o_rEnc
= GetEncodingByMIME( aContent
);
2112 case HtmlMeta::NONE
:
2115 if (i_xDocProps
.is())
2117 uno::Reference
<beans::XPropertyContainer
> xUDProps
2118 = i_xDocProps
->getUserDefinedProperties();
2120 xUDProps
->addProperty(aName
,
2121 beans::PropertyAttribute::REMOVABLE
,
2122 uno::Any(aContent
));
2123 AddMetaUserDefined(aName
);
2125 } catch (uno::Exception
&) {
2138 bool HTMLParser::ParseMetaOptions(
2139 const uno::Reference
<document::XDocumentProperties
> & i_xDocProps
,
2140 SvKeyValueIterator
*i_pHeader
)
2142 HtmlOptionId nContentOption
= HtmlOptionId::CONTENT
;
2143 rtl_TextEncoding eEnc
= RTL_TEXTENCODING_DONTKNOW
;
2145 bool bRet
= ParseMetaOptionsImpl( i_xDocProps
, i_pHeader
,
2146 GetOptions(&nContentOption
),
2149 // If the encoding is set by a META tag, it may only overwrite the
2150 // current encoding if both, the current and the new encoding, are 1-sal_uInt8
2151 // encodings. Everything else cannot lead to reasonable results.
2152 if (RTL_TEXTENCODING_DONTKNOW
!= eEnc
&&
2153 rtl_isOctetTextEncoding( eEnc
) &&
2154 rtl_isOctetTextEncoding( GetSrcEncoding() ) )
2156 eEnc
= GetExtendedCompatibilityTextEncoding( eEnc
);
2157 SetSrcEncoding( eEnc
);
2163 rtl_TextEncoding
HTMLParser::GetEncodingByMIME( const OUString
& rMime
)
2167 INetContentTypeParameterList aParameters
;
2168 if (INetContentTypes::parse(rMime
, sType
, sSubType
, &aParameters
))
2170 auto const iter
= aParameters
.find("charset"_ostr
);
2171 if (iter
!= aParameters
.end())
2173 const INetContentTypeParameter
* pCharset
= &iter
->second
;
2174 OString
sValue(OUStringToOString(pCharset
->m_sValue
, RTL_TEXTENCODING_ASCII_US
));
2175 return GetExtendedCompatibilityTextEncoding( rtl_getTextEncodingFromMimeCharset( sValue
.getStr() ) );
2178 return RTL_TEXTENCODING_DONTKNOW
;
2181 rtl_TextEncoding
HTMLParser::GetEncodingByHttpHeader( SvKeyValueIterator
*pHTTPHeader
)
2183 rtl_TextEncoding eRet
= RTL_TEXTENCODING_DONTKNOW
;
2187 for( bool bCont
= pHTTPHeader
->GetFirst( aKV
); bCont
;
2188 bCont
= pHTTPHeader
->GetNext( aKV
) )
2190 if( aKV
.GetKey().equalsIgnoreAsciiCase( OOO_STRING_SVTOOLS_HTML_META_content_type
) )
2192 if( !aKV
.GetValue().isEmpty() )
2194 eRet
= HTMLParser::GetEncodingByMIME( aKV
.GetValue() );
2202 bool HTMLParser::SetEncodingByHTTPHeader( SvKeyValueIterator
*pHTTPHeader
)
2205 rtl_TextEncoding eEnc
= HTMLParser::GetEncodingByHttpHeader( pHTTPHeader
);
2206 if(RTL_TEXTENCODING_DONTKNOW
!= eEnc
)
2208 SetSrcEncoding( eEnc
);
2215 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */