1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 * This file incorporates work covered by the following license notice:
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
22 #include <comphelper/string.hxx>
23 #include <o3tl/ptr_container.hxx>
24 #include <tools/stream.hxx>
25 #include <tools/debug.hxx>
26 #include <tools/color.hxx>
27 #include <rtl/ustrbuf.hxx>
28 #include <rtl/strbuf.hxx>
30 #include <tools/tenccvt.hxx>
31 #include <tools/datetime.hxx>
32 #include <svl/inettype.hxx>
33 #include <com/sun/star/beans/PropertyAttribute.hpp>
34 #include <com/sun/star/document/XDocumentProperties.hpp>
36 #include <svtools/parhtml.hxx>
37 #include <svtools/htmltokn.h>
38 #include <svtools/htmlkywd.hxx>
43 using namespace ::com::sun::star
;
46 const sal_Int32
MAX_LEN( 1024L );
48 const sal_Int32
MAX_ENTITY_LEN( 8L );
51 // Tables to convert option values into strings
54 static HTMLOptionEnum
const aInputTypeOptEnums
[] =
56 { OOO_STRING_SVTOOLS_HTML_IT_text
, HTML_IT_TEXT
},
57 { OOO_STRING_SVTOOLS_HTML_IT_password
, HTML_IT_PASSWORD
},
58 { OOO_STRING_SVTOOLS_HTML_IT_checkbox
, HTML_IT_CHECKBOX
},
59 { OOO_STRING_SVTOOLS_HTML_IT_radio
, HTML_IT_RADIO
},
60 { OOO_STRING_SVTOOLS_HTML_IT_range
, HTML_IT_RANGE
},
61 { OOO_STRING_SVTOOLS_HTML_IT_scribble
, HTML_IT_SCRIBBLE
},
62 { OOO_STRING_SVTOOLS_HTML_IT_file
, HTML_IT_FILE
},
63 { OOO_STRING_SVTOOLS_HTML_IT_hidden
, HTML_IT_HIDDEN
},
64 { OOO_STRING_SVTOOLS_HTML_IT_submit
, HTML_IT_SUBMIT
},
65 { OOO_STRING_SVTOOLS_HTML_IT_image
, HTML_IT_IMAGE
},
66 { OOO_STRING_SVTOOLS_HTML_IT_reset
, HTML_IT_RESET
},
67 { OOO_STRING_SVTOOLS_HTML_IT_button
, HTML_IT_BUTTON
},
72 static HTMLOptionEnum
const aTableFrameOptEnums
[] =
74 { OOO_STRING_SVTOOLS_HTML_TF_void
, HTML_TF_VOID
},
75 { OOO_STRING_SVTOOLS_HTML_TF_above
, HTML_TF_ABOVE
},
76 { OOO_STRING_SVTOOLS_HTML_TF_below
, HTML_TF_BELOW
},
77 { OOO_STRING_SVTOOLS_HTML_TF_hsides
, HTML_TF_HSIDES
},
78 { OOO_STRING_SVTOOLS_HTML_TF_lhs
, HTML_TF_LHS
},
79 { OOO_STRING_SVTOOLS_HTML_TF_rhs
, HTML_TF_RHS
},
80 { OOO_STRING_SVTOOLS_HTML_TF_vsides
, HTML_TF_VSIDES
},
81 { OOO_STRING_SVTOOLS_HTML_TF_box
, HTML_TF_BOX
},
82 { OOO_STRING_SVTOOLS_HTML_TF_border
, HTML_TF_BOX
},
87 static HTMLOptionEnum
const aTableRulesOptEnums
[] =
89 { OOO_STRING_SVTOOLS_HTML_TR_none
, HTML_TR_NONE
},
90 { OOO_STRING_SVTOOLS_HTML_TR_groups
, HTML_TR_GROUPS
},
91 { OOO_STRING_SVTOOLS_HTML_TR_rows
, HTML_TR_ROWS
},
92 { OOO_STRING_SVTOOLS_HTML_TR_cols
, HTML_TR_COLS
},
93 { OOO_STRING_SVTOOLS_HTML_TR_all
, HTML_TR_ALL
},
97 sal_uInt16
HTMLOption::GetEnum( const HTMLOptionEnum
*pOptEnums
, sal_uInt16 nDflt
) const
99 sal_uInt16 nValue
= nDflt
;
101 while( pOptEnums
->pName
)
102 if( aValue
.equalsIgnoreAsciiCaseAscii( pOptEnums
->pName
) )
107 if( pOptEnums
->pName
)
108 nValue
= pOptEnums
->nValue
;
113 bool HTMLOption::GetEnum( sal_uInt16
&rEnum
, const HTMLOptionEnum
*pOptEnums
) const
115 while( pOptEnums
->pName
)
117 if( aValue
.equalsIgnoreAsciiCaseAscii( pOptEnums
->pName
) )
123 const sal_Char
*pName
= pOptEnums
->pName
;
125 rEnum
= pOptEnums
->nValue
;
130 HTMLOption::HTMLOption( sal_uInt16 nTok
, const OUString
& rToken
,
131 const OUString
& rValue
)
136 DBG_ASSERT( nToken
>=HTML_OPTION_START
&& nToken
<HTML_OPTION_END
,
137 "HTMLOption: unknown token" );
140 sal_uInt32
HTMLOption::GetNumber() const
142 DBG_ASSERT( (nToken
>=HTML_OPTION_NUMBER_START
&&
143 nToken
<HTML_OPTION_NUMBER_END
) ||
144 (nToken
>=HTML_OPTION_CONTEXT_START
&&
145 nToken
<HTML_OPTION_CONTEXT_END
) ||
146 nToken
==HTML_O_VALUE
,
147 "GetNumber: Option not numerical" );
148 OUString
aTmp(comphelper::string::stripStart(aValue
, ' '));
149 sal_Int32 nTmp
= aTmp
.toInt32();
150 return nTmp
>= 0 ? (sal_uInt32
)nTmp
: 0;
153 sal_Int32
HTMLOption::GetSNumber() const
155 DBG_ASSERT( (nToken
>=HTML_OPTION_NUMBER_START
&& nToken
<HTML_OPTION_NUMBER_END
) ||
156 (nToken
>=HTML_OPTION_CONTEXT_START
&& nToken
<HTML_OPTION_CONTEXT_END
),
157 "GetSNumber: Option not numerical" );
158 OUString
aTmp(comphelper::string::stripStart(aValue
, ' '));
159 return aTmp
.toInt32();
162 void HTMLOption::GetNumbers( std::vector
<sal_uInt32
> &rNumbers
, bool bSpaceDelim
) const
168 // This is a very simplified scanner: it only searches all
169 // numerals in the string.
172 for( sal_Int32 i
=0; i
<aValue
.getLength(); i
++ )
174 sal_Unicode c
= aValue
[ i
];
175 if( c
>='0' && c
<='9' )
183 rNumbers
.push_back( nNum
);
190 rNumbers
.push_back( nNum
);
195 // Check whether numbers are separated by ',' and
196 // insert 0 if necessary
198 while( nPos
< aValue
.getLength() )
201 while( nPos
< aValue
.getLength() &&
202 ((c
=aValue
[nPos
]) == ' ' || c
== '\t' ||
203 c
== '\n' || c
== '\r' ) )
206 if( nPos
==aValue
.getLength() )
207 rNumbers
.push_back(0);
210 sal_Int32 nEnd
= aValue
.indexOf( (sal_Unicode
)',', nPos
);
213 sal_Int32 nTmp
= aValue
.copy(nPos
).toInt32();
214 rNumbers
.push_back( nTmp
>= 0 ? (sal_uInt32
)nTmp
: 0 );
215 nPos
= aValue
.getLength();
219 sal_Int32 nTmp
= aValue
.copy(nPos
,nEnd
-nPos
).toInt32();
220 rNumbers
.push_back( nTmp
>= 0 ? (sal_uInt32
)nTmp
: 0 );
228 void HTMLOption::GetColor( Color
& rColor
) const
230 DBG_ASSERT( (nToken
>=HTML_OPTION_COLOR_START
&& nToken
<HTML_OPTION_COLOR_END
) || nToken
==HTML_O_SIZE
,
231 "GetColor: Option is not a color." );
233 OUString
aTmp(aValue
.toAsciiLowerCase());
234 sal_uInt32 nColor
= SAL_MAX_UINT32
;
235 if (!aTmp
.isEmpty() && aTmp
[0] != '#')
236 nColor
= GetHTMLColor(aTmp
);
238 if( SAL_MAX_UINT32
== nColor
)
242 for (sal_uInt32 i
=0; i
<6; ++i
)
244 // Whatever Netscape does to get color values,
245 // at maximum three characters < '0' are ignored.
246 sal_Unicode c
= nPos
<aTmp
.getLength() ? aTmp
[ nPos
++ ] : '0';
249 c
= nPos
<aTmp
.getLength() ? aTmp
[nPos
++] : '0';
251 c
= nPos
<aTmp
.getLength() ? aTmp
[nPos
++] : '0';
254 if( c
>= '0' && c
<= '9' )
256 else if( c
>= 'a' && c
<= 'f' )
257 nColor
+= (c
+ 0xa - 'a');
261 rColor
.SetRed( (sal_uInt8
)((nColor
& 0x00ff0000) >> 16) );
262 rColor
.SetGreen( (sal_uInt8
)((nColor
& 0x0000ff00) >> 8));
263 rColor
.SetBlue( (sal_uInt8
)(nColor
& 0x000000ff) );
266 HTMLInputType
HTMLOption::GetInputType() const
268 DBG_ASSERT( nToken
==HTML_O_TYPE
, "GetInputType: Option not TYPE" );
269 return (HTMLInputType
)GetEnum( aInputTypeOptEnums
, HTML_IT_TEXT
);
272 HTMLTableFrame
HTMLOption::GetTableFrame() const
274 DBG_ASSERT( nToken
==HTML_O_FRAME
, "GetTableFrame: Option not FRAME" );
275 return (HTMLTableFrame
)GetEnum( aTableFrameOptEnums
, HTML_TF_VOID
);
278 HTMLTableRules
HTMLOption::GetTableRules() const
280 DBG_ASSERT( nToken
==HTML_O_RULES
, "GetTableRules: Option not RULES" );
281 return (HTMLTableRules
)GetEnum( aTableRulesOptEnums
, HTML_TR_NONE
);
284 HTMLParser::HTMLParser( SvStream
& rIn
, bool bReadNewDoc
) :
286 bNewDoc(bReadNewDoc
),
292 bReadTextArea(false),
295 bEndTokenFound(false),
296 bPre_IgnoreNewPara(false),
297 bReadNextChar(false),
302 //#i76649, default to UTF-8 for HTML unless we know differently
303 SetSrcEncoding(RTL_TEXTENCODING_UTF8
);
306 HTMLParser::~HTMLParser()
310 SvParserState
HTMLParser::CallParser()
312 eState
= SVPAR_WORKING
;
313 nNextCh
= GetNextChar();
317 bPre_IgnoreNewPara
= false;
321 if( SVPAR_PENDING
!= eState
)
322 ReleaseRef(); // Parser not needed anymore
327 void HTMLParser::Continue( int nToken
)
330 nToken
= GetNextToken();
332 while( IsParserWorking() )
335 nToken
= FilterToken( nToken
);
340 if( IsParserWorking() )
341 SaveState( 0 ); // continue with new token
343 nToken
= GetNextToken();
347 int HTMLParser::FilterToken( int nToken
)
351 case sal_Unicode(EOF
):
369 case HTML_FRAMESET_ON
:
375 bIsInBody
= bReadPRE
= bReadListing
= bReadXMP
= false;
380 bReadPRE
= bReadListing
= bReadXMP
= false;
381 break; // HTML_ON hasn't been passed either !
383 case HTML_PREFORMTXT_ON
:
387 case HTML_PREFORMTXT_OFF
:
391 case HTML_LISTING_ON
:
395 case HTML_LISTING_OFF
:
409 nToken
= FilterPRE( nToken
);
410 else if( bReadListing
)
411 nToken
= FilterListing( nToken
);
413 nToken
= FilterXMP( nToken
);
421 #define HTML_ISDIGIT( c ) comphelper::string::isdigitAscii(c)
422 #define HTML_ISALPHA( c ) comphelper::string::isalphaAscii(c)
423 #define HTML_ISALNUM( c ) comphelper::string::isalnumAscii(c)
424 #define HTML_ISSPACE( c ) ( ' ' == c || (c >= 0x09 && c <= 0x0d) )
425 #define HTML_ISPRINTABLE( c ) ( c >= 32 && c != 127)
426 #define HTML_ISHEXDIGIT( c ) comphelper::string::isxdigitAscii(c)
428 int HTMLParser::ScanText( const sal_Unicode cBreak
)
430 OUStringBuffer
sTmpBuffer( MAX_LEN
);
431 bool bContinue
= true;
432 bool bEqSignFound
= false;
433 sal_Unicode cQuote
= 0U;
435 while( bContinue
&& IsParserWorking() )
441 bEqSignFound
= false;
443 sTmpBuffer
.append( '&' );
446 sal_uLong nStreamPos
= rInput
.Tell();
447 sal_uLong nLinePos
= GetLinePos();
449 sal_Unicode cChar
= 0U;
450 if( '#' == (nNextCh
= GetNextChar()) )
452 nNextCh
= GetNextChar();
453 const bool bIsHex( 'x' == nNextCh
);
454 const bool bIsDecOrHex( bIsHex
|| HTML_ISDIGIT(nNextCh
) );
459 nNextCh
= GetNextChar();
460 while ( HTML_ISHEXDIGIT(nNextCh
) )
462 cChar
= cChar
* 16U +
464 ? sal_Unicode( nNextCh
- '0' )
466 ? sal_Unicode( nNextCh
- 'A' + 10 )
467 : sal_Unicode( nNextCh
- 'a' + 10 ) ) );
468 nNextCh
= GetNextChar();
475 cChar
= cChar
* 10U + sal_Unicode( nNextCh
- '0');
476 nNextCh
= GetNextChar();
478 while( HTML_ISDIGIT(nNextCh
) );
481 if( RTL_TEXTENCODING_DONTKNOW
!= eSrcEnc
&&
482 RTL_TEXTENCODING_UCS2
!= eSrcEnc
&&
483 RTL_TEXTENCODING_UTF8
!= eSrcEnc
&&
486 const sal_uInt32 convertFlags
=
487 RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_DEFAULT
|
488 RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_DEFAULT
|
489 RTL_TEXTTOUNICODE_FLAGS_INVALID_DEFAULT
;
491 sal_Char cEncodedChar
= static_cast<sal_Char
>(cChar
);
492 cChar
= OUString(&cEncodedChar
, 1, eSrcEnc
, convertFlags
).toChar();
495 // If the character could not be
496 // converted, because a conversion is not
497 // available, do no conversion at all.
498 cChar
= cEncodedChar
;
505 else if( HTML_ISALPHA( nNextCh
) )
507 OUStringBuffer
sEntityBuffer( MAX_ENTITY_LEN
);
511 sEntityBuffer
.append( nNextCh
);
513 nNextCh
= GetNextChar();
515 while( nPos
< MAX_ENTITY_LEN
&& HTML_ISALNUM( nNextCh
) &&
518 if( IsParserWorking() && !rInput
.IsEof() )
520 OUString
sEntity(sEntityBuffer
.getStr(), nPos
);
521 cChar
= GetHTMLCharName( sEntity
);
523 // not found ( == 0 ): plain text
524 // or a character which is inserted as attribute
525 if( 0U == cChar
&& ';' != nNextCh
)
527 DBG_ASSERT( rInput
.Tell() - nStreamPos
==
528 (sal_uLong
)(nPos
+1L)*GetCharSize(),
529 "UTF-8 is failing here" );
530 for( sal_Int32 i
= nPos
-1; i
>1; i
-- )
532 nNextCh
= sEntityBuffer
[i
];
533 sEntityBuffer
.setLength( i
);
534 sEntity
= OUString(sEntityBuffer
.getStr(), i
);
535 cChar
= GetHTMLCharName( sEntity
);
538 rInput
.SeekRel( -(long)
539 ((nPos
-i
)*GetCharSize()) );
540 nlLinePos
-= sal_uInt32(nPos
-i
);
542 ClearTxtConvContext();
548 if( !cChar
) // unknown character?
550 // back in stream, insert '&'
551 // and restart with next character
552 sTmpBuffer
.append( '&' );
554 DBG_ASSERT( rInput
.Tell()-nStreamPos
==
555 (sal_uLong
)(nPos
+1)*GetCharSize(),
556 "Wrong stream position" );
557 DBG_ASSERT( nlLinePos
-nLinePos
==
559 "Wrong line position" );
560 rInput
.Seek( nStreamPos
);
561 nlLinePos
= nLinePos
;
562 ClearTxtConvContext();
568 // 1 == Non Breaking Space
571 if (cChar
== 1 || cChar
== 2)
575 // When reading the content of a tag we have
576 // to change it to ' ' or '-'
584 // If not scanning a tag return token
585 aToken
+= sTmpBuffer
.makeStringAndClear();
587 if( !aToken
.isEmpty() )
589 // restart with character
591 DBG_ASSERT( rInput
.Tell()-nStreamPos
==
592 (sal_uLong
)(nPos
+1)*GetCharSize(),
593 "Wrong stream position" );
594 DBG_ASSERT( nlLinePos
-nLinePos
==
596 "Wrong line position" );
597 rInput
.Seek( nStreamPos
);
598 nlLinePos
= nLinePos
;
599 ClearTxtConvContext();
600 return HTML_TEXTTOKEN
;
603 // Hack: _GetNextChar shall not read the
608 return HTML_NONBREAKSPACE
;
610 return HTML_SOFTHYPH
;
617 // &{...};-JavaScript-Macros are not supported any longer.
618 else if( IsParserWorking() )
620 sTmpBuffer
.append( '&' );
625 bNextCh
= (';' == nNextCh
);
626 if( cBreak
=='>' && (cChar
=='\\' || cChar
=='\'' ||
627 cChar
=='\"' || cChar
==' ') )
629 // ' and " have to be escaped within tags to separate
630 // them from ' and " enclosing options.
631 // \ has to be escaped as well.
632 // Space is protected because it's not a delimiter between
634 sTmpBuffer
.append( '\\' );
635 if( MAX_LEN
== sTmpBuffer
.getLength() )
636 aToken
+= sTmpBuffer
.makeStringAndClear();
638 if( IsParserWorking() )
641 sTmpBuffer
.append( cChar
);
643 else if( SVPAR_PENDING
==eState
&& '>'!=cBreak
)
645 // Restart with '&', the remainder is returned as
647 if( !aToken
.isEmpty() || !sTmpBuffer
.isEmpty() )
649 // _GetNextChar() returns the previous text and
650 // during the next execution a new character is read.
651 // Thus we have to position in front of the '&'.
653 rInput
.Seek( nStreamPos
-(sal_uInt32
)GetCharSize() );
654 nlLinePos
= nLinePos
-1;
655 ClearTxtConvContext();
656 bReadNextChar
= true;
663 if( '>'==cBreak
&& !cQuote
)
665 sTmpBuffer
.append( nNextCh
);
671 // Innerhalb von Tags kennzeichnen
672 sTmpBuffer
.append( '\\' );
673 if( MAX_LEN
== sTmpBuffer
.getLength() )
674 aToken
+= sTmpBuffer
.makeStringAndClear();
676 sTmpBuffer
.append( '\\' );
685 else if( cQuote
&& (cQuote
==nNextCh
) )
688 sTmpBuffer
.append( nNextCh
);
689 bEqSignFound
= false;
692 case sal_Unicode(EOF
):
699 sTmpBuffer
.append( nNextCh
);
704 bEqSignFound
= false;
706 sTmpBuffer
.append( nNextCh
);
708 bContinue
= false; // break, String zusammen
714 // If scanning options treat it like a space, ...
715 sTmpBuffer
.append( ' ' );
719 // otherwise it's a separate token.
728 // cr/lf in tag is handled in _GetNextToken()
729 sTmpBuffer
.append( nNextCh
);
732 else if( bReadListing
|| bReadXMP
|| bReadPRE
|| bReadTextArea
)
737 // Reduce sequence of CR/LF/BLANK/TAB to a single blank
740 if( '\t'==nNextCh
&& bReadPRE
&& '>'!=cBreak
)
742 // Pass Tabs up in <PRE>
748 if( '\x0b'==nNextCh
&& (bReadPRE
|| bReadXMP
||bReadListing
) &&
756 sTmpBuffer
.append( nNextCh
);
757 if( '>'!=cBreak
&& (!bReadListing
&& !bReadXMP
&&
758 !bReadPRE
&& !bReadTextArea
) )
760 // Reduce sequences of Blanks/Tabs/CR/LF to a single blank
762 if( sal_Unicode(EOF
) == (nNextCh
= GetNextChar()) &&
765 if( !aToken
.isEmpty() || sTmpBuffer
.getLength() > 1L )
767 // Have seen s.th. aside from blanks?
768 aToken
+= sTmpBuffer
.makeStringAndClear();
769 return HTML_TEXTTOKEN
;
772 // Only read blanks: no text must be returned
773 // and _GetNextToken has to read until EOF
776 } while ( ' ' == nNextCh
|| '\t' == nNextCh
||
777 '\r' == nNextCh
|| '\n' == nNextCh
||
784 bEqSignFound
= false;
785 if (nNextCh
== cBreak
&& !cQuote
)
790 // All remaining characters make their way into the text.
791 sTmpBuffer
.append( nNextCh
);
792 if( MAX_LEN
== sTmpBuffer
.getLength() )
794 aToken
+= sTmpBuffer
.makeStringAndClear();
796 if( ( sal_Unicode(EOF
) == (nNextCh
= GetNextChar()) &&
800 if( !sTmpBuffer
.isEmpty() )
801 aToken
+= sTmpBuffer
.makeStringAndClear();
802 return HTML_TEXTTOKEN
;
804 } while( HTML_ISALPHA( nNextCh
) || HTML_ISDIGIT( nNextCh
) );
809 if( MAX_LEN
== sTmpBuffer
.getLength() )
810 aToken
+= sTmpBuffer
.makeStringAndClear();
812 if( bContinue
&& bNextCh
)
813 nNextCh
= GetNextChar();
816 if( !sTmpBuffer
.isEmpty() )
817 aToken
+= sTmpBuffer
.makeStringAndClear();
819 return HTML_TEXTTOKEN
;
822 int HTMLParser::_GetNextRawToken()
824 OUStringBuffer
sTmpBuffer( MAX_LEN
);
828 // During the last execution we already found the end token,
829 // thus we don't have to search it again.
833 bEndTokenFound
= false;
838 // Default return value: HTML_RAWDATA
839 bool bContinue
= true;
840 int nToken
= HTML_RAWDATA
;
842 while( bContinue
&& IsParserWorking() )
849 // Maybe we've reached the end.
851 // Save what we have read previously...
852 aToken
+= sTmpBuffer
.makeStringAndClear();
854 // and remember position in stream.
855 sal_uLong nStreamPos
= rInput
.Tell();
856 sal_uLong nLineNr
= GetLineNr();
857 sal_uLong nLinePos
= GetLinePos();
859 // Start of an end token?
860 bool bOffState
= false;
861 if( '/' == (nNextCh
= GetNextChar()) )
864 nNextCh
= GetNextChar();
866 else if( '!' == nNextCh
)
868 sTmpBuffer
.append( nNextCh
);
869 nNextCh
= GetNextChar();
872 // Read following letters
873 while( (HTML_ISALPHA(nNextCh
) || '-'==nNextCh
) &&
874 IsParserWorking() && sTmpBuffer
.getLength() < MAX_LEN
)
876 sTmpBuffer
.append( nNextCh
);
877 nNextCh
= GetNextChar();
880 OUString
aTok( sTmpBuffer
.toString() );
881 aTok
= aTok
.toAsciiLowerCase();
883 if( bReadScript
|| !aEndToken
.isEmpty() )
887 if( aTok
.startsWith( OOO_STRING_SVTOOLS_HTML_comment
) )
893 // A script has to end with "</SCRIPT>". But
894 // ">" is optional for security reasons
897 ? aTok
== OOO_STRING_SVTOOLS_HTML_script
898 : aTok
.equals(aEndToken
) );
901 if( bReadComment
&& '>'==nNextCh
&& aTok
.endsWith( "--" ) )
903 // End of comment of style <!----->
904 bReadComment
= false;
909 // Style sheets can be closed by </STYLE>, </HEAD> or <BODY>
911 bDone
= aTok
== OOO_STRING_SVTOOLS_HTML_style
||
912 aTok
== OOO_STRING_SVTOOLS_HTML_head
;
914 bDone
= aTok
== OOO_STRING_SVTOOLS_HTML_body
;
919 // Done! Return the previously read string (if requested)
924 // nToken==0 means, _GetNextToken continues to read
925 if( aToken
.isEmpty() && (bReadStyle
|| bReadScript
) )
927 // Immediately close environment (or context?)
928 // and parse the end token
936 // Keep bReadScript/bReadStyle alive
937 // and parse end token during next execution
938 bEndTokenFound
= true;
941 // Move backwards in stream to '<'
942 rInput
.Seek( nStreamPos
);
943 SetLineNr( nLineNr
);
944 SetLinePos( nLinePos
);
945 ClearTxtConvContext();
948 // Don't append string to token.
949 sTmpBuffer
.setLength( 0L );
953 // remember "</" , everything else we find in the buffer
963 sTmpBuffer
.append( nNextCh
);
966 bool bTwoMinus
= false;
967 nNextCh
= GetNextChar();
968 while( '-' == nNextCh
&& IsParserWorking() )
972 if( MAX_LEN
== sTmpBuffer
.getLength() )
973 aToken
+= sTmpBuffer
.makeStringAndClear();
974 sTmpBuffer
.append( nNextCh
);
975 nNextCh
= GetNextChar();
978 if( '>' == nNextCh
&& IsParserWorking() && bTwoMinus
)
979 bReadComment
= false;
986 // \r\n? closes the current text token (even if it's empty)
987 nNextCh
= GetNextChar();
989 nNextCh
= GetNextChar();
993 // \n closes the current text token (even if it's empty)
994 nNextCh
= GetNextChar();
997 case sal_Unicode(EOF
):
998 // eof closes the current text token and behaves like having read
1000 if( rInput
.IsEof() )
1003 if( !aToken
.isEmpty() || !sTmpBuffer
.isEmpty() )
1005 bEndTokenFound
= true;
1009 bReadScript
= false;
1018 // all remaining characters are appended to the buffer
1019 sTmpBuffer
.append( nNextCh
);
1023 if( (!bContinue
&& !sTmpBuffer
.isEmpty()) ||
1024 MAX_LEN
== sTmpBuffer
.getLength() )
1025 aToken
+= sTmpBuffer
.makeStringAndClear();
1027 if( bContinue
&& bNextCh
)
1028 nNextCh
= GetNextChar();
1031 if( IsParserWorking() )
1040 int HTMLParser::_GetNextToken()
1045 if (mnPendingOffToken
)
1047 // HTML_<TOKEN>_OFF generated for HTML_<TOKEN>_ON
1048 nRet
= mnPendingOffToken
;
1049 mnPendingOffToken
= 0;
1055 if (!maOptions
.empty())
1058 if( !IsParserWorking() ) // Don't continue if already an error occurred
1061 bool bReadNextCharSave
= bReadNextChar
;
1064 DBG_ASSERT( !bEndTokenFound
,
1065 "Read a character despite </SCRIPT> was read?" );
1066 nNextCh
= GetNextChar();
1067 if( !IsParserWorking() ) // Don't continue if already an error occurred
1069 bReadNextChar
= false;
1072 if( bReadScript
|| bReadStyle
|| !aEndToken
.isEmpty() )
1074 nRet
= _GetNextRawToken();
1075 if( nRet
|| !IsParserWorking() )
1080 bool bNextCh
= true;
1085 sal_uLong nStreamPos
= rInput
.Tell();
1086 sal_uLong nLineNr
= GetLineNr();
1087 sal_uLong nLinePos
= GetLinePos();
1089 bool bOffState
= false;
1090 if( '/' == (nNextCh
= GetNextChar()) )
1093 nNextCh
= GetNextChar();
1095 if( HTML_ISALPHA( nNextCh
) || '!'==nNextCh
)
1097 OUStringBuffer sTmpBuffer
;
1099 sTmpBuffer
.append( nNextCh
);
1100 if( MAX_LEN
== sTmpBuffer
.getLength() )
1101 aToken
+= sTmpBuffer
.makeStringAndClear();
1102 nNextCh
= GetNextChar();
1103 } while( '>' != nNextCh
&& '/' != nNextCh
&& !HTML_ISSPACE( nNextCh
) &&
1104 IsParserWorking() && !rInput
.IsEof() );
1106 if( !sTmpBuffer
.isEmpty() )
1107 aToken
+= sTmpBuffer
.makeStringAndClear();
1110 while( HTML_ISSPACE( nNextCh
) && IsParserWorking() )
1111 nNextCh
= GetNextChar();
1113 if( !IsParserWorking() )
1115 if( SVPAR_PENDING
== eState
)
1116 bReadNextChar
= bReadNextCharSave
;
1120 // Search token in table:
1121 sSaveToken
= aToken
;
1122 aToken
= aToken
.toAsciiLowerCase();
1123 if( 0 == (nRet
= GetHTMLToken( aToken
)) )
1125 nRet
= HTML_UNKNOWNCONTROL_ON
;
1127 // If it's a token which can be switched off...
1130 if( HTML_TOKEN_ONOFF
& nRet
)
1132 // and there is an off token, return off token instead
1135 else if( HTML_LINEBREAK
!=nRet
)
1137 // and there is no off token, return unknown token.
1138 // (except for </BR>, that is treated like <BR>)
1139 nRet
= HTML_UNKNOWNCONTROL_OFF
;
1143 if( nRet
== HTML_COMMENT
)
1145 // fix: due to being case sensitive use sSaveToken as start of comment
1146 // and append a blank.
1147 aToken
= sSaveToken
;
1150 sal_uLong nCStreamPos
= 0;
1151 sal_uLong nCLineNr
= 0;
1152 sal_uLong nCLinePos
= 0;
1153 sal_Int32 nCStrLen
= 0;
1156 // Read until closing -->. If not found restart at first >
1157 while( !bDone
&& !rInput
.IsEof() && IsParserWorking() )
1163 nCStreamPos
= rInput
.Tell();
1164 nCStrLen
= aToken
.getLength();
1165 nCLineNr
= GetLineNr();
1166 nCLinePos
= GetLinePos();
1168 bDone
= aToken
.endsWith( "--" );
1170 aToken
+= OUString(nNextCh
);
1173 aToken
+= OUString(nNextCh
);
1175 nNextCh
= GetNextChar();
1177 if( !bDone
&& IsParserWorking() && nCStreamPos
)
1179 rInput
.Seek( nCStreamPos
);
1180 SetLineNr( nCLineNr
);
1181 SetLinePos( nCLinePos
);
1182 ClearTxtConvContext();
1183 aToken
= aToken
.copy(0, nCStrLen
);
1189 // TokenString not needed anymore
1193 // Read until closing '>'
1194 if( '>' != nNextCh
&& IsParserWorking() )
1198 // fdo#34666 fdo#36080 fdo#36390: closing "/>"?:
1199 // generate pending HTML_<TOKEN>_OFF for HTML_<TOKEN>_ON
1200 // Do not convert this to a single HTML_<TOKEN>_OFF
1201 // which lead to fdo#56772.
1202 if ((HTML_TOKEN_ONOFF
& nRet
) && aToken
.endsWith("/"))
1204 mnPendingOffToken
= nRet
+ 1; // HTML_<TOKEN>_ON -> HTML_<TOKEN>_OFF
1205 aToken
= aToken
.replaceAt( aToken
.getLength()-1, 1, ""); // remove trailing '/'
1207 if( sal_Unicode(EOF
) == nNextCh
&& rInput
.IsEof() )
1209 // Move back in front of < and restart there.
1210 // Return < as text.
1211 rInput
.Seek( nStreamPos
);
1212 SetLineNr( nLineNr
);
1213 SetLinePos( nLinePos
);
1214 ClearTxtConvContext();
1217 nRet
= HTML_TEXTTOKEN
;
1218 nNextCh
= GetNextChar();
1223 if( SVPAR_PENDING
== eState
)
1224 bReadNextChar
= bReadNextCharSave
;
1230 // einfach alles wegschmeissen
1232 if( sal_Unicode(EOF
) == nNextCh
&& rInput
.IsEof() )
1234 // Move back in front of < and restart there.
1235 // Return < as text.
1236 rInput
.Seek( nStreamPos
);
1237 SetLineNr( nLineNr
);
1238 SetLinePos( nLinePos
);
1239 ClearTxtConvContext();
1242 nRet
= HTML_TEXTTOKEN
;
1243 nNextCh
= GetNextChar();
1247 if( SVPAR_PENDING
== eState
)
1248 bReadNextChar
= bReadNextCharSave
;
1251 else if( '%' == nNextCh
)
1253 nRet
= HTML_UNKNOWNCONTROL_ON
;
1255 sal_uLong nCStreamPos
= rInput
.Tell();
1256 sal_uLong nCLineNr
= GetLineNr(), nCLinePos
= GetLinePos();
1259 // Read until closing %>. If not found restart at first >.
1260 while( !bDone
&& !rInput
.IsEof() && IsParserWorking() )
1262 bDone
= '>'==nNextCh
&& aToken
.endsWith("%");
1265 aToken
+= OUString(nNextCh
);
1266 nNextCh
= GetNextChar();
1269 if( !bDone
&& IsParserWorking() )
1271 rInput
.Seek( nCStreamPos
);
1272 SetLineNr( nCLineNr
);
1273 SetLinePos( nCLinePos
);
1274 ClearTxtConvContext();
1276 nRet
= HTML_TEXTTOKEN
;
1279 if( IsParserWorking() )
1281 sSaveToken
= aToken
;
1288 nRet
= HTML_TEXTTOKEN
;
1294 if( IsParserWorking() )
1296 bNextCh
= '>' == nNextCh
;
1299 case HTML_TEXTAREA_ON
:
1300 bReadTextArea
= true;
1302 case HTML_TEXTAREA_OFF
:
1303 bReadTextArea
= false;
1305 case HTML_SCRIPT_ON
:
1306 if( !bReadTextArea
)
1309 case HTML_SCRIPT_OFF
:
1310 if( !bReadTextArea
)
1312 bReadScript
= false;
1313 // JavaScript might modify the stream,
1314 // thus the last character has to be read again.
1315 bReadNextChar
= true;
1323 case HTML_STYLE_OFF
:
1331 case sal_Unicode(EOF
):
1332 if( rInput
.IsEof() )
1334 eState
= SVPAR_ACCEPTED
;
1339 // Read normal text.
1345 // form feeds are passed upwards separately
1346 nRet
= HTML_LINEFEEDCHAR
; // !!! should be FORMFEEDCHAR
1351 if( bReadListing
|| bReadXMP
|| bReadPRE
|| bReadTextArea
)
1353 sal_Unicode c
= GetNextChar();
1354 if( ( '\n' != nNextCh
|| '\r' != c
) &&
1355 ( '\r' != nNextCh
|| '\n' != c
) )
1360 nRet
= HTML_NEWPARA
;
1367 nRet
= HTML_TABCHAR
;
1376 // "normal" text to come
1378 bNextCh
= 0 == aToken
.getLength();
1380 // the text should be processed
1381 if( !bNextCh
&& eState
== SVPAR_PENDING
)
1383 eState
= SVPAR_WORKING
;
1384 bReadNextChar
= true;
1390 if( bNextCh
&& SVPAR_WORKING
== eState
)
1392 nNextCh
= GetNextChar();
1393 if( SVPAR_PENDING
== eState
&& nRet
&& HTML_TEXTTOKEN
!= nRet
)
1395 bReadNextChar
= true;
1396 eState
= SVPAR_WORKING
;
1400 } while( !nRet
&& SVPAR_WORKING
== eState
);
1402 if( SVPAR_PENDING
== eState
)
1403 nRet
= -1; // s.th. invalid
1408 void HTMLParser::UnescapeToken()
1412 bool bEscape
= false;
1413 while( nPos
< aToken
.getLength() )
1415 bool bOldEscape
= bEscape
;
1417 if( '\\'==aToken
[nPos
] && !bOldEscape
)
1419 aToken
= aToken
.replaceAt( nPos
, 1, "" );
1429 const HTMLOptions
& HTMLParser::GetOptions( sal_uInt16
*pNoConvertToken
)
1431 // If the options for the current token have already been returned,
1432 // return them once again.
1433 if (!maOptions
.empty())
1437 while( nPos
< aToken
.getLength() )
1439 // A letter? Option beginning here.
1440 if( HTML_ISALPHA( aToken
[nPos
] ) )
1444 sal_Int32 nStt
= nPos
;
1445 sal_Unicode cChar
= 0;
1447 // Actually only certain characters allowed.
1448 // Netscape only looks for "=" and white space (c.f.
1449 // Mozilla: PA_FetchRequestedNameValues in lipparse/pa_mdl.c)
1450 while( nPos
< aToken
.getLength() && '=' != (cChar
=aToken
[nPos
]) &&
1451 HTML_ISPRINTABLE(cChar
) && !HTML_ISSPACE(cChar
) )
1454 OUString
sName( aToken
.copy( nStt
, nPos
-nStt
) );
1456 // PlugIns require original token name. Convert to lower case only for searching.
1457 nToken
= GetHTMLOption( sName
.toAsciiLowerCase() ); // Name is ready
1458 DBG_ASSERTWARNING( nToken
!=HTML_O_UNKNOWN
,
1459 "GetOption: unknown HTML option" );
1460 bool bStripCRLF
= (nToken
< HTML_OPTION_SCRIPT_START
||
1461 nToken
>= HTML_OPTION_SCRIPT_END
) &&
1462 (!pNoConvertToken
|| nToken
!= *pNoConvertToken
);
1464 while( nPos
< aToken
.getLength() &&
1465 ( !HTML_ISPRINTABLE( (cChar
=aToken
[nPos
]) ) ||
1466 HTML_ISSPACE(cChar
) ) )
1469 // Option with value?
1470 if( nPos
!=aToken
.getLength() && '='==cChar
)
1474 while( nPos
< aToken
.getLength() &&
1475 ( !HTML_ISPRINTABLE( (cChar
=aToken
[nPos
]) ) ||
1476 ' '==cChar
|| '\t'==cChar
|| '\r'==cChar
|| '\n'==cChar
) )
1479 if( nPos
!= aToken
.getLength() )
1483 if( ('"'==cChar
) || ('\'')==cChar
)
1485 sal_Unicode cEnd
= cChar
;
1488 bool bEscape
= false;
1489 while( nPos
< aToken
.getLength() && !bDone
)
1491 bool bOldEscape
= bEscape
;
1493 cChar
= aToken
[nPos
];
1499 aToken
= aToken
.replaceAt( nPos
, 1, "" );
1510 aToken
= aToken
.replaceAt( nPos
, 1, "" );
1516 bDone
= !bOldEscape
&& cChar
==cEnd
;
1525 if( nPos
!=aToken
.getLength() )
1530 // More liberal than the standard: allow all printable characters
1531 bool bEscape
= false;
1533 while( nPos
< aToken
.getLength() && !bDone
)
1535 bool bOldEscape
= bEscape
;
1537 sal_Unicode c
= aToken
[nPos
];
1541 bDone
= !bOldEscape
;
1559 aToken
= aToken
.replaceAt( nPos
, 1, "" );
1565 if( HTML_ISPRINTABLE( c
) )
1575 aValue
= aToken
.copy( nStt
, nLen
);
1579 // Token is known and can be saved
1580 std::unique_ptr
<HTMLOption
> pOption(
1581 new HTMLOption(sal::static_int_cast
<sal_uInt16
>(nToken
), sName
, aValue
));
1583 o3tl::ptr_container::push_back(maOptions
, std::move(pOption
));
1586 // Ignore white space and unexpected characters
1593 int HTMLParser::FilterPRE( int nToken
)
1597 // in Netscape they only have impact in not empty paragraphs
1598 case HTML_PARABREAK_ON
:
1599 nToken
= HTML_LINEBREAK
;
1601 case HTML_LINEBREAK
:
1604 if( bPre_IgnoreNewPara
)
1610 sal_Int32 nSpaces
= (8 - (nPre_LinePos
% 8));
1611 DBG_ASSERT( aToken
.isEmpty(), "Why is the token not empty?" );
1612 if (aToken
.getLength() < nSpaces
)
1614 using comphelper::string::padToLength
;
1615 OUStringBuffer
aBuf(aToken
);
1616 aToken
= padToLength(aBuf
, nSpaces
, ' ').makeStringAndClear();
1618 nPre_LinePos
+= nSpaces
;
1619 nToken
= HTML_TEXTTOKEN
;
1623 case HTML_TEXTTOKEN
:
1624 nPre_LinePos
+= aToken
.getLength();
1627 case HTML_SELECT_ON
:
1628 case HTML_SELECT_OFF
:
1634 case HTML_TEXTAREA_ON
:
1635 case HTML_TEXTAREA_OFF
:
1638 case HTML_APPLET_ON
:
1639 case HTML_APPLET_OFF
:
1644 case HTML_HEAD1_OFF
:
1646 case HTML_HEAD2_OFF
:
1648 case HTML_HEAD3_OFF
:
1650 case HTML_HEAD4_OFF
:
1652 case HTML_HEAD5_OFF
:
1654 case HTML_HEAD6_OFF
:
1655 case HTML_BLOCKQUOTE_ON
:
1656 case HTML_BLOCKQUOTE_OFF
:
1657 case HTML_ADDRESS_ON
:
1658 case HTML_ADDRESS_OFF
:
1661 case HTML_CENTER_ON
:
1662 case HTML_CENTER_OFF
:
1663 case HTML_DIVISION_ON
:
1664 case HTML_DIVISION_OFF
:
1666 case HTML_SCRIPT_ON
:
1667 case HTML_SCRIPT_OFF
:
1671 case HTML_TABLE_OFF
:
1672 case HTML_CAPTION_ON
:
1673 case HTML_CAPTION_OFF
:
1674 case HTML_COLGROUP_ON
:
1675 case HTML_COLGROUP_OFF
:
1679 case HTML_THEAD_OFF
:
1681 case HTML_TFOOT_OFF
:
1683 case HTML_TBODY_OFF
:
1684 case HTML_TABLEROW_ON
:
1685 case HTML_TABLEROW_OFF
:
1686 case HTML_TABLEDATA_ON
:
1687 case HTML_TABLEDATA_OFF
:
1688 case HTML_TABLEHEADER_ON
:
1689 case HTML_TABLEHEADER_OFF
:
1691 case HTML_ANCHOR_ON
:
1692 case HTML_ANCHOR_OFF
:
1695 case HTML_ITALIC_ON
:
1696 case HTML_ITALIC_OFF
:
1697 case HTML_STRIKE_ON
:
1698 case HTML_STRIKE_OFF
:
1699 case HTML_STRIKETHROUGH_ON
:
1700 case HTML_STRIKETHROUGH_OFF
:
1701 case HTML_UNDERLINE_ON
:
1702 case HTML_UNDERLINE_OFF
:
1703 case HTML_BASEFONT_ON
:
1704 case HTML_BASEFONT_OFF
:
1708 case HTML_BLINK_OFF
:
1711 case HTML_SUBSCRIPT_ON
:
1712 case HTML_SUBSCRIPT_OFF
:
1713 case HTML_SUPERSCRIPT_ON
:
1714 case HTML_SUPERSCRIPT_OFF
:
1715 case HTML_BIGPRINT_ON
:
1716 case HTML_BIGPRINT_OFF
:
1717 case HTML_SMALLPRINT_OFF
:
1718 case HTML_SMALLPRINT_ON
:
1720 case HTML_EMPHASIS_ON
:
1721 case HTML_EMPHASIS_OFF
:
1722 case HTML_CITIATION_ON
:
1723 case HTML_CITIATION_OFF
:
1724 case HTML_STRONG_ON
:
1725 case HTML_STRONG_OFF
:
1728 case HTML_SAMPLE_ON
:
1729 case HTML_SAMPLE_OFF
:
1730 case HTML_KEYBOARD_ON
:
1731 case HTML_KEYBOARD_OFF
:
1732 case HTML_VARIABLE_ON
:
1733 case HTML_VARIABLE_OFF
:
1734 case HTML_DEFINSTANCE_ON
:
1735 case HTML_DEFINSTANCE_OFF
:
1736 case HTML_SHORTQUOTE_ON
:
1737 case HTML_SHORTQUOTE_OFF
:
1738 case HTML_LANGUAGE_ON
:
1739 case HTML_LANGUAGE_OFF
:
1740 case HTML_AUTHOR_ON
:
1741 case HTML_AUTHOR_OFF
:
1742 case HTML_PERSON_ON
:
1743 case HTML_PERSON_OFF
:
1744 case HTML_ACRONYM_ON
:
1745 case HTML_ACRONYM_OFF
:
1746 case HTML_ABBREVIATION_ON
:
1747 case HTML_ABBREVIATION_OFF
:
1748 case HTML_INSERTEDTEXT_ON
:
1749 case HTML_INSERTEDTEXT_OFF
:
1750 case HTML_DELETEDTEXT_ON
:
1751 case HTML_DELETEDTEXT_OFF
:
1752 case HTML_TELETYPE_ON
:
1753 case HTML_TELETYPE_OFF
:
1757 // The remainder is treated as an unknown token.
1762 ( ((HTML_TOKEN_ONOFF
& nToken
) && (1 & nToken
))
1763 ? HTML_UNKNOWNCONTROL_OFF
1764 : HTML_UNKNOWNCONTROL_ON
);
1769 bPre_IgnoreNewPara
= false;
1774 int HTMLParser::FilterXMP( int nToken
)
1779 if( bPre_IgnoreNewPara
)
1781 case HTML_TEXTTOKEN
:
1782 case HTML_NONBREAKSPACE
:
1789 if( (HTML_TOKEN_ONOFF
& nToken
) && (1 & nToken
) )
1791 sSaveToken
= "</" + sSaveToken
;
1794 sSaveToken
= "<" + sSaveToken
;
1795 if( !aToken
.isEmpty() )
1799 aToken
= sSaveToken
+ aToken
;
1802 aToken
= sSaveToken
;
1804 nToken
= HTML_TEXTTOKEN
;
1809 bPre_IgnoreNewPara
= false;
1814 int HTMLParser::FilterListing( int nToken
)
1819 if( bPre_IgnoreNewPara
)
1821 case HTML_TEXTTOKEN
:
1822 case HTML_NONBREAKSPACE
:
1830 ( ((HTML_TOKEN_ONOFF
& nToken
) && (1 & nToken
))
1831 ? HTML_UNKNOWNCONTROL_OFF
1832 : HTML_UNKNOWNCONTROL_ON
);
1837 bPre_IgnoreNewPara
= false;
1842 bool HTMLParser::InternalImgToPrivateURL( OUString
& rURL
)
1844 bool bFound
= false;
1846 if( rURL
.startsWith( OOO_STRING_SVTOOLS_HTML_internal_icon
) )
1848 OUString
aName( rURL
.copy(14) );
1852 bFound
= aName
== OOO_STRING_SVTOOLS_HTML_INT_ICON_baddata
;
1855 bFound
= aName
== OOO_STRING_SVTOOLS_HTML_INT_ICON_delayed
;
1858 bFound
= aName
== OOO_STRING_SVTOOLS_HTML_INT_ICON_embed
;
1861 bFound
= aName
== OOO_STRING_SVTOOLS_HTML_INT_ICON_insecure
;
1864 bFound
= aName
== OOO_STRING_SVTOOLS_HTML_INT_ICON_notfound
;
1870 OUString
sTmp ( rURL
);
1871 rURL
= OOO_STRING_SVTOOLS_HTML_private_image
;
// Identifiers for the META names known to the parser. ParseMetaOptionsImpl
// stores one of these in nAction and dispatches on it.
// NOTE(review): the enum header and some enumerators (e.g. HTML_META_NONE,
// HTML_META_AUTHOR, which are used further down) are not visible in this
// view.
1881 HTML_META_DESCRIPTION
,
1884 HTML_META_CLASSIFICATION
,
1886 HTML_META_CHANGEDBY
,
1888 HTML_META_GENERATOR
,
1889 HTML_META_SDFOOTNOTE
,
1890 HTML_META_SDENDNOTE
,
1891 HTML_META_CONTENT_TYPE
// Maps META name strings to the HTML_META_* identifiers above. It is passed
// to HTMLOption::GetEnum() when a META tag's name / http-equiv option is
// evaluated.
1895 static HTMLOptionEnum
const aHTMLMetaNameTable
[] =
1897 { OOO_STRING_SVTOOLS_HTML_META_author
, HTML_META_AUTHOR
},
1898 { OOO_STRING_SVTOOLS_HTML_META_changed
, HTML_META_CHANGED
},
1899 { OOO_STRING_SVTOOLS_HTML_META_changedby
, HTML_META_CHANGEDBY
},
1900 { OOO_STRING_SVTOOLS_HTML_META_classification
,HTML_META_CLASSIFICATION
},
1901 { OOO_STRING_SVTOOLS_HTML_META_content_type
, HTML_META_CONTENT_TYPE
},
1902 { OOO_STRING_SVTOOLS_HTML_META_created
, HTML_META_CREATED
},
1903 { OOO_STRING_SVTOOLS_HTML_META_description
, HTML_META_DESCRIPTION
},
1904 { OOO_STRING_SVTOOLS_HTML_META_keywords
, HTML_META_KEYWORDS
},
1905 { OOO_STRING_SVTOOLS_HTML_META_generator
, HTML_META_GENERATOR
},
1906 { OOO_STRING_SVTOOLS_HTML_META_refresh
, HTML_META_REFRESH
},
1907 { OOO_STRING_SVTOOLS_HTML_META_sdendnote
, HTML_META_SDENDNOTE
},
1908 { OOO_STRING_SVTOOLS_HTML_META_sdfootnote
, HTML_META_SDFOOTNOTE
},
// Called by ParseMetaOptionsImpl with the META name after a user-defined
// document property has been added for it. The parameter is unnamed here, so
// this implementation cannot use the name.
// NOTE(review): presumably a hook for derived parsers - confirm in the class
// declaration.
1913 void HTMLParser::AddMetaUserDefined( OUString
const & )
// Evaluates the options of one META tag and applies the result.
//
// i_xDocProps    document properties that receive author, description,
//                keywords, subject, modified-by, dates and user-defined
//                entries; every use is guarded by is(), so it may be empty.
// i_pHTTPHeader  receives the name/content pair of an http-equiv META;
//                may be null (checked before use).
// aOptions       the options of the META tag.
// o_rEnc         out: set from the content when the META is a content-type
//                declaration with non-empty content.
// Result: bool; presumably bChanged (its updates and the return statement
//         are not visible in this view).
1917 bool HTMLParser::ParseMetaOptionsImpl(
1918 const uno::Reference
<document::XDocumentProperties
> & i_xDocProps
,
1919 SvKeyValueIterator
*i_pHTTPHeader
,
1920 const HTMLOptions
& aOptions
,
1921 rtl_TextEncoding
& o_rEnc
)
1923 OUString aName
, aContent
;
1924 sal_uInt16 nAction
= HTML_META_NONE
;
1925 bool bHTTPEquiv
= false, bChanged
= false;
// The options are visited from the last one to the first (i is decremented
// before indexing).
1927 for ( size_t i
= aOptions
.size(); i
; )
1929 const HTMLOption
& aOption
= aOptions
[--i
];
1930 switch ( aOption
.GetToken() )
// First visible case (its label is not shown; presumably the NAME option):
// the name is only mapped to an action if no action has been chosen yet.
1933 aName
= aOption
.GetString();
1934 if ( HTML_META_NONE
==nAction
)
1936 aOption
.GetEnum( nAction
, aHTMLMetaNameTable
);
// HTTP-EQUIV: maps the name to an action unconditionally.
// NOTE(review): bHTTPEquiv is presumably set here; not visible in this view.
1939 case HTML_O_HTTPEQUIV
:
1940 aName
= aOption
.GetString();
1941 aOption
.GetEnum( nAction
, aHTMLMetaNameTable
);
1944 case HTML_O_CONTENT
:
1945 aContent
= aOption
.GetString();
// Normalise line breaks in the content: everything except a (non http-equiv)
// description has CR and LF removed; a description has its line ends
// converted to the system line end instead.
1950 if ( bHTTPEquiv
|| HTML_META_DESCRIPTION
!= nAction
)
1952 // if it is not a Description, remove CRs and LFs from CONTENT
1953 aContent
= comphelper::string::remove(aContent
, '\r');
1954 aContent
= comphelper::string::remove(aContent
, '\n');
1958 // convert line endings for Description
1959 aContent
= convertLineEnd(aContent
, GetSystemLineEnd());
// http-equiv entries are also recorded in the HTTP header list, if one was
// supplied.
1963 if ( bHTTPEquiv
&& i_pHTTPHeader
)
1965 // Netscape seems to just ignore a closing ", so we do too
1966 if ( aContent
.endsWith("\"") )
1968 aContent
= aContent
.copy( 0, aContent
.getLength() - 1 );
1970 SvKeyValue
aKeyValue( aName
, aContent
);
1971 i_pHTTPHeader
->Append( aKeyValue
);
// Dispatch on the recognised META name (the switch line itself is not
// visible in this view).
1976 case HTML_META_AUTHOR
:
1977 if (i_xDocProps
.is()) {
1978 i_xDocProps
->setAuthor( aContent
);
1982 case HTML_META_DESCRIPTION
:
1983 if (i_xDocProps
.is()) {
1984 i_xDocProps
->setDescription( aContent
);
// Keywords arrive as one comma-separated string and are split before being
// stored.
1988 case HTML_META_KEYWORDS
:
1989 if (i_xDocProps
.is()) {
1990 i_xDocProps
->setKeywords(
1991 ::comphelper::string::convertCommaSeparated(aContent
));
// The classification META is stored as the document subject.
1995 case HTML_META_CLASSIFICATION
:
1996 if (i_xDocProps
.is()) {
1997 i_xDocProps
->setSubject( aContent
);
2002 case HTML_META_CHANGEDBY
:
2003 if (i_xDocProps
.is()) {
2004 i_xDocProps
->setModifiedBy( aContent
);
// Created/changed: the content must consist of exactly two ';'-separated
// tokens, each parsed as an integer; the first is used to construct the
// Date, the second the Time.
2008 case HTML_META_CREATED
:
2009 case HTML_META_CHANGED
:
2010 if ( i_xDocProps
.is() && !aContent
.isEmpty() &&
2011 comphelper::string::getTokenCount(aContent
, ';') == 2 )
2013 Date
aDate( (sal_uLong
)aContent
.getToken(0, ';').toInt32() );
2014 tools::Time
aTime( (sal_uLong
)aContent
.getToken(1, ';').toInt32() );
2015 DateTime
aDateTime( aDate
, aTime
);
2016 ::util::DateTime uDT
= aDateTime
.GetUNODateTime();
2017 if ( HTML_META_CREATED
==nAction
)
2018 i_xDocProps
->setCreationDate( uDT
);
2020 i_xDocProps
->setModificationDate( uDT
);
// Refresh: only a debug assertion is visible here. It fires when the META is
// an http-equiv but no header list was supplied. The message is German,
// roughly: "Reload URL lost because a mandatory change was not made".
2025 case HTML_META_REFRESH
:
2026 DBG_ASSERT( !bHTTPEquiv
|| i_pHTTPHeader
,
2027 "Reload-URL aufgrund unterlassener MUSS-Aenderung verlorengegangen" );
// Content-Type: derive the text encoding from the MIME string.
2030 case HTML_META_CONTENT_TYPE
:
2031 if ( !aContent
.isEmpty() )
2033 o_rEnc
= GetEncodingByMIME( aContent
);
// Unknown META name: store it as a removable user-defined document property
// and notify AddMetaUserDefined().
2037 case HTML_META_NONE
:
2040 if (i_xDocProps
.is())
2042 uno::Reference
<beans::XPropertyContainer
> xUDProps
2043 = i_xDocProps
->getUserDefinedProperties();
2045 xUDProps
->addProperty(aName
,
2046 beans::PropertyAttribute::REMOVABLE
,
2047 uno::makeAny(OUString(aContent
)));
2048 AddMetaUserDefined(aName
);
// UNO exceptions thrown while adding the property are caught here.
// NOTE(review): the handler body is not visible in this view.
2050 } catch (uno::Exception
&) {
// Parses the options of the current META tag via ParseMetaOptionsImpl() and,
// if the tag declared an encoding, switches the source encoding when that is
// safe.
//
// i_xDocProps  document properties, passed through to ParseMetaOptionsImpl().
// i_pHeader    HTTP header list, passed through to ParseMetaOptionsImpl().
// Result: presumably bRet, the result of ParseMetaOptionsImpl() (the return
//         statement is not visible in this view).
2063 bool HTMLParser::ParseMetaOptions(
2064 const uno::Reference
<document::XDocumentProperties
> & i_xDocProps
,
2065 SvKeyValueIterator
*i_pHeader
)
2067 sal_uInt16 nContentOption
= HTML_O_CONTENT
;
2068 rtl_TextEncoding eEnc
= RTL_TEXTENCODING_DONTKNOW
;
2070 bool bRet
= ParseMetaOptionsImpl( i_xDocProps
, i_pHeader
,
2071 GetOptions(&nContentOption
),
2074 // If the encoding is set by a META tag, it may only overwrite the
2075 // current encoding if both, the current and the new encoding, are 1-byte
2076 // encodings. Everything else cannot lead to reasonable results.
2077 if (RTL_TEXTENCODING_DONTKNOW
!= eEnc
&&
2078 rtl_isOctetTextEncoding( eEnc
) &&
2079 rtl_isOctetTextEncoding( GetSrcEncoding() ) )
// The declared encoding is passed through
// GetExtendedCompatibilityTextEncoding() before it is applied.
2081 eEnc
= GetExtendedCompatibilityTextEncoding( eEnc
);
2082 SetSrcEncoding( eEnc
);
// Extracts the text encoding from a MIME content-type string via its
// "charset" parameter.
//
// rMime   content-type string to parse.
// Result: the encoding for the charset value, passed through
//         GetExtendedCompatibilityTextEncoding(); the final return yields
//         RTL_TEXTENCODING_DONTKNOW when no encoding was returned earlier.
2088 rtl_TextEncoding
HTMLParser::GetEncodingByMIME( const OUString
& rMime
)
2092 INetContentTypeParameterList aParameters
;
2093 if (INetContentTypes::parse(rMime
, sType
, sSubType
, &aParameters
))
2095 const INetContentTypeParameter
* pCharset
= aParameters
.find("charset");
// NOTE(review): pCharset is dereferenced below; the guard for a missing
// charset parameter is presumably on a line not visible in this view.
// The charset value is converted to an ASCII byte string for the rtl lookup.
2098 OString
sValue(OUStringToOString(pCharset
->m_sValue
, RTL_TEXTENCODING_ASCII_US
));
2099 return GetExtendedCompatibilityTextEncoding( rtl_getTextEncodingFromMimeCharset( sValue
.getStr() ) );
2102 return RTL_TEXTENCODING_DONTKNOW
;
// Looks for a content-type entry in the given HTTP header list and derives a
// text encoding from its value.
//
// pHTTPHeader  header key/value list to scan.
//              NOTE(review): it is dereferenced in the loop; a null check, if
//              any, is not visible in this view.
// Result: presumably eRet, which starts as RTL_TEXTENCODING_DONTKNOW and is
//         replaced by the result of GetEncodingByMIME() for a matching entry
//         (the return statement is not visible in this view).
2105 rtl_TextEncoding
HTMLParser::GetEncodingByHttpHeader( SvKeyValueIterator
*pHTTPHeader
)
2107 rtl_TextEncoding eRet
= RTL_TEXTENCODING_DONTKNOW
;
// Walk all header entries.
2111 for( bool bCont
= pHTTPHeader
->GetFirst( aKV
); bCont
;
2112 bCont
= pHTTPHeader
->GetNext( aKV
) )
// The key is compared ASCII case-insensitively against the content-type name.
2114 if( aKV
.GetKey().equalsIgnoreAsciiCase( OOO_STRING_SVTOOLS_HTML_META_content_type
) )
2116 if( !aKV
.GetValue().isEmpty() )
2118 eRet
= HTMLParser::GetEncodingByMIME( aKV
.GetValue() );
// Determines the encoding announced in the HTTP header list and, if one was
// found, makes it the parser's source encoding.
//
// pHTTPHeader  header key/value list, forwarded to GetEncodingByHttpHeader().
// Result: bool; presumably whether an encoding was applied (the return
//         statements are not visible in this view).
2126 bool HTMLParser::SetEncodingByHTTPHeader( SvKeyValueIterator
*pHTTPHeader
)
2129 rtl_TextEncoding eEnc
= HTMLParser::GetEncodingByHttpHeader( pHTTPHeader
);
// RTL_TEXTENCODING_DONTKNOW means no usable encoding was found; the source
// encoding is left untouched in that case.
2130 if(RTL_TEXTENCODING_DONTKNOW
!= eEnc
)
2132 SetSrcEncoding( eEnc
);
2139 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */