1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 * This file incorporates work covered by the following license notice:
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
23 #include <comphelper/string.hxx>
24 #include <tools/stream.hxx>
25 #include <tools/debug.hxx>
26 #include <tools/color.hxx>
27 #include <rtl/ustrbuf.hxx>
28 #include <rtl/strbuf.hxx>
30 #include <tools/tenccvt.hxx>
31 #include <tools/datetime.hxx>
32 #include <svl/inettype.hxx>
33 #include <com/sun/star/beans/PropertyAttribute.hpp>
34 #include <com/sun/star/document/XDocumentProperties.hpp>
36 #include <svtools/parhtml.hxx>
37 #include <svtools/htmltokn.h>
38 #include <svtools/htmlkywd.hxx>
// Bring the com::sun::star namespace into scope for the rest of this file.
42 using namespace ::com::sun::star
;
// Flush threshold for the temporary text buffers used by the scanners in this
// file: once a buffer's length equals MAX_LEN its content is moved to aToken.
45 const sal_Int32
MAX_LEN( 1024L );
// Upper bound on the number of characters collected for a named character
// entity ("&name") in HTMLParser::ScanText.
47 const sal_Int32
MAX_ENTITY_LEN( 8L );
50 // Tables mapping option value strings to their enum constants
// (they are searched by HTMLOption::GetEnum).
// Value strings and HTML_IT_* constants for the TYPE option; used by
// HTMLOption::GetInputType().
53 static HTMLOptionEnum
const aInputTypeOptEnums
[] =
55 { OOO_STRING_SVTOOLS_HTML_IT_text
, HTML_IT_TEXT
},
56 { OOO_STRING_SVTOOLS_HTML_IT_password
, HTML_IT_PASSWORD
},
57 { OOO_STRING_SVTOOLS_HTML_IT_checkbox
, HTML_IT_CHECKBOX
},
58 { OOO_STRING_SVTOOLS_HTML_IT_radio
, HTML_IT_RADIO
},
59 { OOO_STRING_SVTOOLS_HTML_IT_range
, HTML_IT_RANGE
},
60 { OOO_STRING_SVTOOLS_HTML_IT_scribble
, HTML_IT_SCRIBBLE
},
61 { OOO_STRING_SVTOOLS_HTML_IT_file
, HTML_IT_FILE
},
62 { OOO_STRING_SVTOOLS_HTML_IT_hidden
, HTML_IT_HIDDEN
},
63 { OOO_STRING_SVTOOLS_HTML_IT_submit
, HTML_IT_SUBMIT
},
64 { OOO_STRING_SVTOOLS_HTML_IT_image
, HTML_IT_IMAGE
},
65 { OOO_STRING_SVTOOLS_HTML_IT_reset
, HTML_IT_RESET
},
66 { OOO_STRING_SVTOOLS_HTML_IT_button
, HTML_IT_BUTTON
},
// Value strings and HTML_TF_* constants for the FRAME option; used by
// HTMLOption::GetTableFrame().
71 static HTMLOptionEnum
const aTableFrameOptEnums
[] =
73 { OOO_STRING_SVTOOLS_HTML_TF_void
, HTML_TF_VOID
},
74 { OOO_STRING_SVTOOLS_HTML_TF_above
, HTML_TF_ABOVE
},
75 { OOO_STRING_SVTOOLS_HTML_TF_below
, HTML_TF_BELOW
},
76 { OOO_STRING_SVTOOLS_HTML_TF_hsides
, HTML_TF_HSIDES
},
77 { OOO_STRING_SVTOOLS_HTML_TF_lhs
, HTML_TF_LHS
},
78 { OOO_STRING_SVTOOLS_HTML_TF_rhs
, HTML_TF_RHS
},
79 { OOO_STRING_SVTOOLS_HTML_TF_vsides
, HTML_TF_VSIDES
},
80 { OOO_STRING_SVTOOLS_HTML_TF_box
, HTML_TF_BOX
},
// The "border" string is an alias: it maps to the same constant as "box".
81 { OOO_STRING_SVTOOLS_HTML_TF_border
, HTML_TF_BOX
},
// Value strings and HTML_TR_* constants for the RULES option; used by
// HTMLOption::GetTableRules().
86 static HTMLOptionEnum
const aTableRulesOptEnums
[] =
88 { OOO_STRING_SVTOOLS_HTML_TR_none
, HTML_TR_NONE
},
89 { OOO_STRING_SVTOOLS_HTML_TR_groups
, HTML_TR_GROUPS
},
90 { OOO_STRING_SVTOOLS_HTML_TR_rows
, HTML_TR_ROWS
},
91 { OOO_STRING_SVTOOLS_HTML_TR_cols
, HTML_TR_COLS
},
92 { OOO_STRING_SVTOOLS_HTML_TR_all
, HTML_TR_ALL
},
// Looks up this option's value (aValue) in the table pOptEnums.
// The table is walked while the current entry's pName is non-null, and aValue
// is compared ASCII-case-insensitively against each pName.
// nValue starts out as nDflt and is replaced by the matching entry's nValue
// when the walk stopped on an entry that still has a name (i.e. a match).
96 sal_uInt16
HTMLOption::GetEnum( const HTMLOptionEnum
*pOptEnums
, sal_uInt16 nDflt
) const
98 sal_uInt16 nValue
= nDflt
;
100 while( pOptEnums
->pName
)
101 if( aValue
.EqualsIgnoreCaseAscii( pOptEnums
->pName
) )
// A non-null pName here means the search ended on a matching entry.
106 if( pOptEnums
->pName
)
107 nValue
= pOptEnums
->nValue
;
// Variant of GetEnum that writes the result to rEnum instead of returning it.
// Walks pOptEnums while the entry's pName is non-null, comparing aValue
// ASCII-case-insensitively, and assigns the entry's nValue to rEnum.
// NOTE(review): the bool result presumably reports whether a match was
// found - confirm against the return statements.
112 bool HTMLOption::GetEnum( sal_uInt16
&rEnum
, const HTMLOptionEnum
*pOptEnums
) const
114 while( pOptEnums
->pName
)
116 if( aValue
.EqualsIgnoreCaseAscii( pOptEnums
->pName
) )
122 const sal_Char
*pName
= pOptEnums
->pName
;
124 rEnum
= pOptEnums
->nValue
;
// Constructs an option from its token id (nTok), its name as written in the
// document (rToken) and its value (rValue).
// Debug builds assert that nToken lies in [HTML_OPTION_START, HTML_OPTION_END).
129 HTMLOption::HTMLOption( sal_uInt16 nTok
, const String
& rToken
,
130 const String
& rValue
)
135 DBG_ASSERT( nToken
>=HTML_OPTION_START
&& nToken
<HTML_OPTION_END
,
136 "HTMLOption: unknown token" );
// Returns the option value as an unsigned number.
// Leading spaces are stripped from aValue before it is converted with
// ToInt32(); a negative result is clamped to 0.
// Debug builds assert that the option is a number option, a context option
// or HTML_O_VALUE.
139 sal_uInt32
HTMLOption::GetNumber() const
141 DBG_ASSERT( (nToken
>=HTML_OPTION_NUMBER_START
&&
142 nToken
<HTML_OPTION_NUMBER_END
) ||
143 (nToken
>=HTML_OPTION_CONTEXT_START
&&
144 nToken
<HTML_OPTION_CONTEXT_END
) ||
145 nToken
==HTML_O_VALUE
,
146 "GetNumber: Option not numerical" );
147 String
aTmp(comphelper::string::stripStart(aValue
, ' '));
148 sal_Int32 nTmp
= aTmp
.ToInt32();
// Negative numbers cannot be represented in the unsigned result: map to 0.
149 return nTmp
>= 0 ? (sal_uInt32
)nTmp
: 0;
// Returns the option value as a signed number.
// Leading spaces are stripped from aValue before it is converted with
// ToInt32(); unlike GetNumber() the result is not clamped.
// Debug builds assert that the option is a number option or a context option.
152 sal_Int32
HTMLOption::GetSNumber() const
154 DBG_ASSERT( (nToken
>=HTML_OPTION_NUMBER_START
&& nToken
<HTML_OPTION_NUMBER_END
) ||
155 (nToken
>=HTML_OPTION_CONTEXT_START
&& nToken
<HTML_OPTION_CONTEXT_END
),
156 "GetSNumber: Option not numerical" );
157 String
aTmp(comphelper::string::stripStart(aValue
, ' '));
158 return aTmp
.ToInt32();
// Extracts a list of unsigned numbers from aValue and appends them to
// rNumbers. Two scanning strategies are visible below:
//  - a simple scan over every character that collects runs of the digits
//    '0'..'9' into numbers;
//  - a comma-separated scan that skips blanks/tabs/CR/LF before each entry,
//    appends 0 when the value ends after such white space, and otherwise
//    converts the text up to the next ',' (or up to the end) with ToInt32(),
//    clamping negative results to 0.
// NOTE(review): bSpaceDelim presumably selects the first strategy - confirm.
161 void HTMLOption::GetNumbers( std::vector
<sal_uInt32
> &rNumbers
, bool bSpaceDelim
) const
167 // This is a very simplified scanner: it only searches all
168 // numerals in the string.
171 for( xub_StrLen i
=0; i
<aValue
.Len(); i
++ )
173 register sal_Unicode c
= aValue
.GetChar( i
);
174 if( c
>='0' && c
<='9' )
182 rNumbers
.push_back( nNum
);
189 rNumbers
.push_back( nNum
);
194 // Check whether numbers are separated by ',' and
195 // insert 0 if necessary
197 while( nPos
< aValue
.Len() )
199 register sal_Unicode c
;
// Skip white space in front of the next entry.
200 while( nPos
< aValue
.Len() &&
201 ((c
=aValue
.GetChar(nPos
)) == ' ' || c
== '\t' ||
202 c
== '\n' || c
== '\r' ) )
// Only white space was left: record an empty entry as 0.
205 if( nPos
==aValue
.Len() )
206 rNumbers
.push_back(0);
209 xub_StrLen nEnd
= aValue
.Search( (sal_Unicode
)',', nPos
);
// No further ',': the remainder of the string is the last number.
210 if( STRING_NOTFOUND
==nEnd
)
212 sal_Int32 nTmp
= aValue
.Copy(nPos
).ToInt32();
213 rNumbers
.push_back( nTmp
>= 0 ? (sal_uInt32
)nTmp
: 0 );
219 aValue
.Copy(nPos
,nEnd
-nPos
).ToInt32();
220 rNumbers
.push_back( nTmp
>= 0 ? (sal_uInt32
)nTmp
: 0 );
// Converts the option value into a color and stores it in rColor.
// If the value does not start with '#', it is first looked up with
// GetHTMLColor(). If nColor is still SAL_MAX_UINT32 afterwards, up to six
// characters of the value are read as hexadecimal digits; once the end of
// the value is reached, '0' is substituted.
// The red, green and blue components are finally taken from bits 16-23,
// 8-15 and 0-7 of nColor.
// Debug builds assert that the option is a color option or HTML_O_SIZE.
228 void HTMLOption::GetColor( Color
& rColor
) const
230 DBG_ASSERT( (nToken
>=HTML_OPTION_COLOR_START
&& nToken
<HTML_OPTION_COLOR_END
) || nToken
==HTML_O_SIZE
,
231 "GetColor: Option is not a color." );
233 String
aTmp( aValue
);
// SAL_MAX_UINT32 serves as the "no color found yet" marker.
235 sal_uInt32 nColor
= SAL_MAX_UINT32
;
236 if( '#'!=aTmp
.GetChar( 0 ) )
237 nColor
= GetHTMLColor( aTmp
);
239 if( SAL_MAX_UINT32
== nColor
)
243 for( sal_uInt32 i
=0; i
<6; i
++ )
245 // Whatever Netscape does to get color values,
246 // at maximum three characters < '0' are ignored.
247 register sal_Unicode c
= nPos
<aTmp
.Len() ? aTmp
.GetChar( nPos
++ )
251 c
= nPos
<aTmp
.Len() ? aTmp
.GetChar(nPos
++) : '0';
253 c
= nPos
<aTmp
.Len() ? aTmp
.GetChar(nPos
++) : '0';
256 if( c
>= '0' && c
<= '9' )
258 else if( c
>= 'A' && c
<= 'F' )
// Split the 0x00RRGGBB value into its three components.
263 rColor
.SetRed( (sal_uInt8
)((nColor
& 0x00ff0000) >> 16) );
264 rColor
.SetGreen( (sal_uInt8
)((nColor
& 0x0000ff00) >> 8));
265 rColor
.SetBlue( (sal_uInt8
)(nColor
& 0x000000ff) );
// Returns the value of a TYPE option as HTMLInputType, looked up in
// aInputTypeOptEnums; HTML_IT_TEXT is the default passed to GetEnum().
// Debug builds assert that this option is HTML_O_TYPE.
268 HTMLInputType
HTMLOption::GetInputType() const
270 DBG_ASSERT( nToken
==HTML_O_TYPE
, "GetInputType: Option not TYPE" );
271 return (HTMLInputType
)GetEnum( aInputTypeOptEnums
, HTML_IT_TEXT
);
// Returns the value of a FRAME option as HTMLTableFrame, looked up in
// aTableFrameOptEnums; HTML_TF_VOID is the default passed to GetEnum().
// Debug builds assert that this option is HTML_O_FRAME.
274 HTMLTableFrame
HTMLOption::GetTableFrame() const
276 DBG_ASSERT( nToken
==HTML_O_FRAME
, "GetTableFrame: Option not FRAME" );
277 return (HTMLTableFrame
)GetEnum( aTableFrameOptEnums
, HTML_TF_VOID
);
// Returns the value of a RULES option as HTMLTableRules, looked up in
// aTableRulesOptEnums; HTML_TR_NONE is the default passed to GetEnum().
// Debug builds assert that this option is HTML_O_RULES.
280 HTMLTableRules
HTMLOption::GetTableRules() const
282 DBG_ASSERT( nToken
==HTML_O_RULES
, "GetTableRules: Option not RULES" );
283 return (HTMLTableRules
)GetEnum( aTableRulesOptEnums
, HTML_TR_NONE
);
// Constructs a parser reading from rIn. bReadNewDoc is stored in bNewDoc;
// the visible state flags are initialised to false and the source encoding
// is preset to UTF-8.
286 HTMLParser::HTMLParser( SvStream
& rIn
, bool bReadNewDoc
) :
288 bNewDoc(bReadNewDoc
),
294 bReadTextArea(false),
297 bEndTokenFound(false),
298 bPre_IgnoreNewPara(false),
299 bReadNextChar(false),
303 //#i76649, default to UTF-8 for HTML unless we know differently
304 SetSrcEncoding(RTL_TEXTENCODING_UTF8
);
// Destructor of HTMLParser.
307 HTMLParser::~HTMLParser()
// Starts parsing: switches the state to SVPAR_WORKING, reads the first
// character into nNextCh and resets bPre_IgnoreNewPara.
// If the state is not SVPAR_PENDING afterwards, the parser releases its
// reference because it is no longer needed.
311 SvParserState
HTMLParser::CallParser()
313 eState
= SVPAR_WORKING
;
314 nNextCh
= GetNextChar();
318 bPre_IgnoreNewPara
= false;
322 if( SVPAR_PENDING
!= eState
)
323 ReleaseRef(); // Parser not needed anymore
// Token loop of the parser. While the parser is working, each token is passed
// through FilterToken(); if the parser is still working afterwards the state
// is saved with SaveState( 0 ) and the next token is fetched with
// GetNextToken().
328 void HTMLParser::Continue( int nToken
)
331 nToken
= GetNextToken();
333 while( IsParserWorking() )
336 nToken
= FilterToken( nToken
);
341 if( IsParserWorking() )
342 SaveState( 0 ); // continue with new token
344 nToken
= GetNextToken();
// Updates the parser's context flags (bIsInHeader, bIsInBody, bReadPRE,
// bReadListing, bReadXMP) according to nToken and, depending on those flags,
// passes the token on to FilterPRE(), FilterListing() or FilterXMP().
// The result of that filter is stored back into nToken.
348 int HTMLParser::FilterToken( int nToken
)
352 case sal_Unicode(EOF
):
// bIsInHeader is true only if the token is HTML_HEAD_ON.
359 bIsInHeader
= HTML_HEAD_ON
== nToken
;
363 case HTML_FRAMESET_ON
:
// bIsInBody is true only if the token is HTML_BODY_ON.
365 bIsInBody
= HTML_BODY_ON
== nToken
;
369 bIsInBody
= bReadPRE
= bReadListing
= bReadXMP
= false;
374 bReadPRE
= bReadListing
= bReadXMP
= false;
375 break; // HTML_ON hasn't been passed either !
377 case HTML_PREFORMTXT_ON
:
381 case HTML_PREFORMTXT_OFF
:
385 case HTML_LISTING_ON
:
389 case HTML_LISTING_OFF
:
403 nToken
= FilterPRE( nToken
);
404 else if( bReadListing
)
405 nToken
= FilterListing( nToken
);
407 nToken
= FilterXMP( nToken
);
// Character classification helpers used by the scanner.
// HTML_ISDIGIT, HTML_ISALPHA, HTML_ISALNUM and HTML_ISHEXDIGIT forward to the
// ASCII-only tests in comphelper::string.
415 #define HTML_ISDIGIT( c ) comphelper::string::isdigitAscii(c)
416 #define HTML_ISALPHA( c ) comphelper::string::isalphaAscii(c)
417 #define HTML_ISALNUM( c ) comphelper::string::isalnumAscii(c)
// White space: blank or one of the control characters 0x09..0x0d.
// Note: the argument is evaluated more than once.
418 #define HTML_ISSPACE( c ) ( ' ' == c || (c >= 0x09 && c <= 0x0d) )
// Printable: code 32 or above, except 127. The argument is evaluated twice.
419 #define HTML_ISPRINTABLE( c ) ( c >= 32 && c != 127)
420 #define HTML_ISHEXDIGIT( c ) comphelper::string::isxdigitAscii(c)
// Scans text starting at nNextCh and collects it in aToken.
// cBreak is the character that ends the scan when it is met outside of
// quotes; cBreak == '>' means that the content of a tag is being scanned.
// Characters are gathered in sTmpBuffer, which is moved to aToken whenever
// its length reaches MAX_LEN and once more at the end.
// Visible special handling:
//  - "&#..." numeric character references (decimal and hexadecimal) and
//    "&name" named entities are converted to a character; if the entity is
//    unknown, the stream is rewound and the '&' is kept as text;
//  - while scanning a tag, the characters \ ' " and blank that result from
//    an entity are escaped with a preceding backslash;
//  - outside of PRE/XMP/LISTING/TEXTAREA (and outside of tags) sequences of
//    blank/tab/CR/LF are reduced to a single blank.
// The visible return values are HTML_TEXTTOKEN, HTML_NONBREAKSPACE and
// HTML_SOFTHYPH.
422 int HTMLParser::ScanText( const sal_Unicode cBreak
)
424 OUStringBuffer
sTmpBuffer( MAX_LEN
);
425 int bContinue
= true;
426 int bEqSignFound
= false;
427 sal_Unicode cQuote
= 0U;
429 while( bContinue
&& IsParserWorking() )
435 bEqSignFound
= false;
437 sTmpBuffer
.append( (sal_Unicode
)'&' );
// Remember where the character reference started so that the stream can be
// rewound if it turns out not to be a valid reference.
440 sal_uLong nStreamPos
= rInput
.Tell();
441 sal_uLong nLinePos
= GetLinePos();
443 sal_Unicode cChar
= 0U;
// Numeric character reference: "&#" followed by decimal digits, or by 'x'
// and hexadecimal digits.
444 if( '#' == (nNextCh
= GetNextChar()) )
446 nNextCh
= GetNextChar();
447 const bool bIsHex( 'x' == nNextCh
);
448 const bool bIsDecOrHex( bIsHex
|| HTML_ISDIGIT(nNextCh
) );
453 nNextCh
= GetNextChar();
454 while ( HTML_ISHEXDIGIT(nNextCh
) )
456 cChar
= cChar
* 16U +
458 ? sal_Unicode( nNextCh
- '0' )
460 ? sal_Unicode( nNextCh
- 'A' + 10 )
461 : sal_Unicode( nNextCh
- 'a' + 10 ) ) );
462 nNextCh
= GetNextChar();
469 cChar
= cChar
* 10U + sal_Unicode( nNextCh
- '0');
470 nNextCh
= GetNextChar();
472 while( HTML_ISDIGIT(nNextCh
) );
// For source encodings other than "don't know", UCS2 and UTF-8 the numeric
// value is treated as a character of the source encoding and converted.
475 if( RTL_TEXTENCODING_DONTKNOW
!= eSrcEnc
&&
476 RTL_TEXTENCODING_UCS2
!= eSrcEnc
&&
477 RTL_TEXTENCODING_UTF8
!= eSrcEnc
&&
480 const sal_uInt32 convertFlags
=
481 RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_DEFAULT
|
482 RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_DEFAULT
|
483 RTL_TEXTTOUNICODE_FLAGS_INVALID_DEFAULT
;
485 sal_Char cEncodedChar
= static_cast<sal_Char
>(cChar
);
486 cChar
= OUString(&cEncodedChar
, 1, eSrcEnc
, convertFlags
).toChar();
489 // If the character could not be
490 // converted, because a conversion is not
491 // available, do no conversion at all.
492 cChar
= cEncodedChar
;
// Named entity: collect up to MAX_ENTITY_LEN alphanumeric characters.
499 else if( HTML_ISALPHA( nNextCh
) )
501 OUStringBuffer
sEntityBuffer( MAX_ENTITY_LEN
);
502 xub_StrLen nPos
= 0L;
505 sEntityBuffer
.append( nNextCh
);
507 nNextCh
= GetNextChar();
509 while( nPos
< MAX_ENTITY_LEN
&& HTML_ISALNUM( nNextCh
) &&
512 if( IsParserWorking() && !rInput
.IsEof() )
514 OUString
sEntity(sEntityBuffer
.getStr(), nPos
);
515 cChar
= GetHTMLCharName( sEntity
);
517 // not found ( == 0 ): plain text
518 // or a character which is inserted as attribute
519 if( 0U == cChar
&& ';' != nNextCh
)
521 DBG_ASSERT( rInput
.Tell() - nStreamPos
==
522 (sal_uLong
)(nPos
+1L)*GetCharSize(),
523 "UTF-8 is failing here" );
// Retry the lookup with successively shorter prefixes of the collected name.
524 for( xub_StrLen i
=nPos
-1L; i
>1L; i
-- )
526 nNextCh
= sEntityBuffer
[i
];
527 sEntityBuffer
.setLength( i
);
528 sEntity
= OUString(sEntityBuffer
.getStr(), i
);
529 cChar
= GetHTMLCharName( sEntity
);
532 rInput
.SeekRel( -(long)
533 ((nPos
-i
)*GetCharSize()) );
534 nlLinePos
-= sal_uInt32(nPos
-i
);
536 ClearTxtConvContext();
542 if( !cChar
) // unknown character?
544 // back in stream, insert '&'
545 // and restart with next character
546 sTmpBuffer
.append( (sal_Unicode
)'&' );
548 DBG_ASSERT( rInput
.Tell()-nStreamPos
==
549 (sal_uLong
)(nPos
+1)*GetCharSize(),
550 "Wrong stream position" );
551 DBG_ASSERT( nlLinePos
-nLinePos
==
553 "Wrong line position" );
554 rInput
.Seek( nStreamPos
);
555 nlLinePos
= nLinePos
;
556 ClearTxtConvContext();
560 // 1 == Non Breaking Space
567 // When reading the content of a tag we have
568 // to change it to ' ' or '-'
571 case 1U: cChar
= ' '; break;
572 case 2U: cChar
= '-'; break;
574 DBG_ASSERT( cChar
==1U,
575 "\0x00 should be handled already!" );
581 // If not scanning a tag return token
583 String( sTmpBuffer
.makeStringAndClear() );
588 // restart with character
590 DBG_ASSERT( rInput
.Tell()-nStreamPos
==
591 (sal_uLong
)(nPos
+1)*GetCharSize(),
592 "Wrong stream position" );
593 DBG_ASSERT( nlLinePos
-nLinePos
==
595 "Wrong line position" );
596 rInput
.Seek( nStreamPos
);
597 nlLinePos
= nLinePos
;
598 ClearTxtConvContext();
599 return HTML_TEXTTOKEN
;
602 // Hack: _GetNextChar shall not read the
607 return HTML_NONBREAKSPACE
;
609 return HTML_SOFTHYPH
;
611 aToken
+= (sal_Unicode
)'&';
613 String(sEntityBuffer
.makeStringAndClear());
621 // &{...};-JavaScript-Macros are not supported any longer.
622 else if( IsParserWorking() )
624 sTmpBuffer
.append( (sal_Unicode
)'&' );
// A terminating ';' belongs to the reference; only then is the next
// character to be read.
629 bNextCh
= (';' == nNextCh
);
630 if( cBreak
=='>' && (cChar
=='\\' || cChar
=='\'' ||
631 cChar
=='\"' || cChar
==' ') )
633 // ' and " have to be escaped within tags to separate
634 // them from ' and " enclosing options.
635 // \ has to be escaped as well.
636 // Space is protected because it's not a delimiter between
638 sTmpBuffer
.append( (sal_Unicode
)'\\' );
639 if( MAX_LEN
== sTmpBuffer
.getLength() )
640 aToken
+= String(sTmpBuffer
.makeStringAndClear());
642 if( IsParserWorking() )
645 sTmpBuffer
.append( cChar
);
647 else if( SVPAR_PENDING
==eState
&& '>'!=cBreak
)
649 // Restart with '&', the remainder is returned as
651 if( aToken
.Len() || sTmpBuffer
.getLength() )
653 // _GetNextChar() returns the previous text and
654 // during the next execution a new character is read.
655 // Thus we have to position in front of the '&'.
657 rInput
.Seek( nStreamPos
-(sal_uInt32
)GetCharSize() );
658 nlLinePos
= nLinePos
-1;
659 ClearTxtConvContext();
660 bReadNextChar
= true;
667 if( '>'==cBreak
&& !cQuote
)
669 sTmpBuffer
.append( nNextCh
);
675 // Mark (escape) it inside tags
676 sTmpBuffer
.append( (sal_Unicode
)'\\' );
677 if( MAX_LEN
== sTmpBuffer
.getLength() )
678 aToken
+= String(sTmpBuffer
.makeStringAndClear());
680 sTmpBuffer
.append( (sal_Unicode
)'\\' );
689 else if( cQuote
&& (cQuote
==nNextCh
) )
692 sTmpBuffer
.append( nNextCh
);
693 bEqSignFound
= false;
696 case sal_Unicode(EOF
):
703 sTmpBuffer
.append( nNextCh
);
708 bEqSignFound
= false;
710 sTmpBuffer
.append( nNextCh
);
712 bContinue
= false; // break, the string is complete
718 // If scanning options treat it like a space, ...
719 sTmpBuffer
.append( (sal_Unicode
)' ' );
723 // otherwise it's a separate token.
732 // cr/lf in tag is handled in _GetNextToken()
733 sTmpBuffer
.append( nNextCh
);
736 else if( bReadListing
|| bReadXMP
|| bReadPRE
|| bReadTextArea
)
741 // Reduce sequence of CR/LF/BLANK/TAB to a single blank
744 if( '\t'==nNextCh
&& bReadPRE
&& '>'!=cBreak
)
746 // In <PRE>: pass tabs on to the caller
752 if( '\x0b'==nNextCh
&& (bReadPRE
|| bReadXMP
||bReadListing
) &&
760 sTmpBuffer
.append( nNextCh
);
761 if( '>'!=cBreak
&& (!bReadListing
&& !bReadXMP
&&
762 !bReadPRE
&& !bReadTextArea
) )
764 // Reduce sequences of Blanks/Tabs/CR/LF to a single blank
766 if( sal_Unicode(EOF
) == (nNextCh
= GetNextChar()) &&
769 if( aToken
.Len() || sTmpBuffer
.getLength() > 1L )
771 // Have seen something aside from blanks?
772 aToken
+= String(sTmpBuffer
.makeStringAndClear());
773 return HTML_TEXTTOKEN
;
776 // Only read blanks: no text must be returned
777 // and _GetNextToken has to read until EOF
780 } while ( ' ' == nNextCh
|| '\t' == nNextCh
||
781 '\r' == nNextCh
|| '\n' == nNextCh
||
788 bEqSignFound
= false;
// Stop at an unquoted cBreak, or when appending another MAX_LEN characters
// would exceed the maximum string length.
789 if( (nNextCh
==cBreak
&& !cQuote
) ||
790 (sal_uLong(aToken
.Len()) + MAX_LEN
) > sal_uLong(STRING_MAXLEN
& ~1 ))
795 // All remaining characters make their way into the text.
796 sTmpBuffer
.append( nNextCh
);
797 if( MAX_LEN
== sTmpBuffer
.getLength() )
799 aToken
+= String(sTmpBuffer
.makeStringAndClear());
800 if( (sal_uLong(aToken
.Len()) + MAX_LEN
) >
801 sal_uLong(STRING_MAXLEN
& ~1 ) )
803 nNextCh
= GetNextChar();
804 return HTML_TEXTTOKEN
;
807 if( ( sal_Unicode(EOF
) == (nNextCh
= GetNextChar()) &&
811 if( sTmpBuffer
.getLength() )
812 aToken
+= String(sTmpBuffer
.makeStringAndClear());
813 return HTML_TEXTTOKEN
;
815 } while( HTML_ISALPHA( nNextCh
) || HTML_ISDIGIT( nNextCh
) );
820 if( MAX_LEN
== sTmpBuffer
.getLength() )
821 aToken
+= String(sTmpBuffer
.makeStringAndClear());
823 if( bContinue
&& bNextCh
)
824 nNextCh
= GetNextChar();
// Flush whatever is left in the buffer before returning.
827 if( sTmpBuffer
.getLength() )
828 aToken
+= String(sTmpBuffer
.makeStringAndClear());
830 return HTML_TEXTTOKEN
;
// Reads raw (unparsed) data, i.e. the content of a script, a style sheet or
// of an element terminated by aEndToken, and collects it in aToken.
// The default result is HTML_RAWDATA. At a possible tag start the stream
// position and line information are remembered, the following name is read,
// and it is checked whether it is the expected end token (or, for style
// sheets, one of STYLE, HEAD or BODY). If so, the stream is rewound to the
// '<' so that the end token gets parsed normally; otherwise the characters
// read are kept as part of the raw data.
// The text "<!--" ... "-->" is tracked via bReadComment.
833 int HTMLParser::_GetNextRawToken()
835 OUStringBuffer
sTmpBuffer( MAX_LEN
);
839 // During the last execution we already found the end token,
840 // thus we don't have to search it again.
844 bEndTokenFound
= false;
849 // Default return value: HTML_RAWDATA
850 int bContinue
= true;
851 int nToken
= HTML_RAWDATA
;
853 while( bContinue
&& IsParserWorking() )
860 // Maybe we've reached the end.
862 // Save what we have read previously...
863 aToken
+= String(sTmpBuffer
.makeStringAndClear());
865 // and remember position in stream.
866 sal_uLong nStreamPos
= rInput
.Tell();
867 sal_uLong nLineNr
= GetLineNr();
868 sal_uLong nLinePos
= GetLinePos();
870 // Start of an end token?
871 int bOffState
= false;
872 if( '/' == (nNextCh
= GetNextChar()) )
875 nNextCh
= GetNextChar();
877 else if( '!' == nNextCh
)
879 sTmpBuffer
.append( nNextCh
);
880 nNextCh
= GetNextChar();
883 // Read following letters
884 while( (HTML_ISALPHA(nNextCh
) || '-'==nNextCh
) &&
885 IsParserWorking() && sTmpBuffer
.getLength() < MAX_LEN
)
887 sTmpBuffer
.append( nNextCh
);
888 nNextCh
= GetNextChar();
891 String
aTok( sTmpBuffer
.toString() );
894 if( bReadScript
|| aEndToken
.Len() )
898 if( aTok
.CompareToAscii( OOO_STRING_SVTOOLS_HTML_comment
, 3 )
905 // A script has to end with "</SCRIPT>". But
906 // ">" is optional for security reasons
908 COMPARE_EQUAL
== ( bReadScript
909 ? aTok
.CompareToAscii(OOO_STRING_SVTOOLS_HTML_script
)
910 : aTok
.CompareTo(aEndToken
) );
913 if( bReadComment
&& '>'==nNextCh
&& aTok
.Len() >= 2 &&
914 aTok
.Copy( aTok
.Len()-2 ).EqualsAscii( "--" ) )
916 // End of comment of style <!----->
917 bReadComment
= false;
922 // Style sheets can be closed by </STYLE>, </HEAD> or <BODY>
924 bDone
= aTok
.CompareToAscii(OOO_STRING_SVTOOLS_HTML_style
)
926 aTok
.CompareToAscii(OOO_STRING_SVTOOLS_HTML_head
)
930 aTok
.CompareToAscii(OOO_STRING_SVTOOLS_HTML_body
) == COMPARE_EQUAL
;
935 // Done! Return the previously read string (if requested)
940 // nToken==0 means, _GetNextToken continues to read
941 if( !aToken
.Len() && (bReadStyle
|| bReadScript
) )
943 // Immediately close environment (or context?)
944 // and parse the end token
952 // Keep bReadScript/bReadStyle alive
953 // and parse end token during next execution
954 bEndTokenFound
= true;
957 // Move backwards in stream to '<'
958 rInput
.Seek( nStreamPos
);
959 SetLineNr( nLineNr
);
960 SetLinePos( nLinePos
);
961 ClearTxtConvContext();
964 // Don't append string to token.
965 sTmpBuffer
.setLength( 0L );
969 // remember "</" , everything else we find in the buffer
970 aToken
+= (sal_Unicode
)'<';
972 aToken
+= (sal_Unicode
)'/';
979 sTmpBuffer
.append( nNextCh
);
// Track runs of '-' in order to recognise the "-->" that ends a comment.
982 bool bTwoMinus
= false;
983 nNextCh
= GetNextChar();
984 while( '-' == nNextCh
&& IsParserWorking() )
988 if( MAX_LEN
== sTmpBuffer
.getLength() )
989 aToken
+= String(sTmpBuffer
.makeStringAndClear());
990 sTmpBuffer
.append( nNextCh
);
991 nNextCh
= GetNextChar();
994 if( '>' == nNextCh
&& IsParserWorking() && bTwoMinus
)
995 bReadComment
= false;
1002 // \r\n? closes the current text token (even if it's empty)
1003 nNextCh
= GetNextChar();
1005 nNextCh
= GetNextChar();
1009 // \n closes the current text token (even if it's empty)
1010 nNextCh
= GetNextChar();
1013 case sal_Unicode(EOF
):
1014 // eof closes the current text token and behaves like having read
1016 if( rInput
.IsEof() )
1019 if( aToken
.Len() || sTmpBuffer
.getLength() )
1021 bEndTokenFound
= true;
1025 bReadScript
= false;
1034 // all remaining characters are appended to the buffer
1035 sTmpBuffer
.append( nNextCh
);
// Flush the buffer when leaving the loop with data, or when it is full.
1039 if( (!bContinue
&& sTmpBuffer
.getLength() > 0L) ||
1040 MAX_LEN
== sTmpBuffer
.getLength() )
1041 aToken
+= String(sTmpBuffer
.makeStringAndClear());
1043 if( bContinue
&& bNextCh
)
1044 nNextCh
= GetNextChar();
1047 if( IsParserWorking() )
// Reads the next token from the input and returns its id in nRet; the token
// text is left in aToken (and, for tags, the original spelling in
// sSaveToken).
// Visible steps:
//  - a pending OFF token (mnPendingOffToken, produced by a tag closed with
//    "/>") is handed out first;
//  - while a script, style sheet or end-token-delimited element is being
//    read, the work is delegated to _GetNextRawToken();
//  - at a tag start the tag name is read, upper-cased and looked up with
//    GetHTMLToken(); unknown names become HTML_UNKNOWNCONTROL_ON/_OFF;
//  - comments are read up to "-->" and "<% ... %>" blocks up to "%>"; if the
//    end is not found, the stream is rewound;
//  - if the end of the input is reached inside a tag, the stream is rewound
//    in front of the '<', which is then returned as text;
//  - everything else is text, line feed, tab or new paragraph.
// If the state is SVPAR_PENDING at the end, nRet is set to -1.
1056 int HTMLParser::_GetNextToken()
1061 if (mnPendingOffToken
)
1063 // HTML_<TOKEN>_OFF generated for HTML_<TOKEN>_ON
1064 nRet
= mnPendingOffToken
;
1065 mnPendingOffToken
= 0;
1071 if (!maOptions
.empty())
1074 if( !IsParserWorking() ) // Don't continue if already an error occurred
// Saved so that bReadNextChar can be restored if parsing goes pending.
1077 bool bReadNextCharSave
= bReadNextChar
;
1080 DBG_ASSERT( !bEndTokenFound
,
1081 "Read a character despite </SCRIPT> was read?" );
1082 nNextCh
= GetNextChar();
1083 if( !IsParserWorking() ) // Don't continue if already an error occurred
1085 bReadNextChar
= false;
1088 if( bReadScript
|| bReadStyle
|| aEndToken
.Len() )
1090 nRet
= _GetNextRawToken();
1091 if( nRet
|| !IsParserWorking() )
// Remember the position of the possible tag so that the stream can be
// rewound if the tag turns out to be incomplete.
1101 sal_uLong nStreamPos
= rInput
.Tell();
1102 sal_uLong nLineNr
= GetLineNr();
1103 sal_uLong nLinePos
= GetLinePos();
1105 int bOffState
= false;
1106 if( '/' == (nNextCh
= GetNextChar()) )
1109 nNextCh
= GetNextChar();
1111 if( HTML_ISALPHA( nNextCh
) || '!'==nNextCh
)
1113 OUStringBuffer sTmpBuffer
;
1115 sTmpBuffer
.append( nNextCh
);
1116 if( MAX_LEN
== sTmpBuffer
.getLength() )
1117 aToken
+= String(sTmpBuffer
.makeStringAndClear());
1118 nNextCh
= GetNextChar();
1119 } while( '>' != nNextCh
&& '/' != nNextCh
&& !HTML_ISSPACE( nNextCh
) &&
1120 IsParserWorking() && !rInput
.IsEof() );
1122 if( sTmpBuffer
.getLength() )
1123 aToken
+= String(sTmpBuffer
.makeStringAndClear());
// Skip white space after the tag name.
1126 while( HTML_ISSPACE( nNextCh
) && IsParserWorking() )
1127 nNextCh
= GetNextChar();
1129 if( !IsParserWorking() )
1131 if( SVPAR_PENDING
== eState
)
1132 bReadNextChar
= bReadNextCharSave
;
1136 // Search token in table:
1137 sSaveToken
= aToken
;
1138 aToken
.ToUpperAscii();
1139 if( 0 == (nRet
= GetHTMLToken( aToken
)) )
1141 nRet
= HTML_UNKNOWNCONTROL_ON
;
1143 // If it's a token which can be switched off...
1146 if( HTML_TOKEN_ONOFF
& nRet
)
1148 // and there is an off token, return off token instead
1151 else if( HTML_LINEBREAK
!=nRet
)
1153 // and there is no off token, return unknown token.
1154 // (except for </BR>, that is treated like <BR>)
1155 nRet
= HTML_UNKNOWNCONTROL_OFF
;
1159 if( nRet
== HTML_COMMENT
)
1161 // fix: due to being case sensitive use sSaveToken as start of comment
1162 // and append a blank.
1163 aToken
= sSaveToken
;
1165 aToken
+= (sal_Unicode
)' ';
1166 sal_uLong nCStreamPos
= 0;
1167 sal_uLong nCLineNr
= 0;
1168 sal_uLong nCLinePos
= 0;
1169 xub_StrLen nCStrLen
= 0;
1172 // Read until closing -->. If not found restart at first >
1173 while( !bDone
&& !rInput
.IsEof() && IsParserWorking() )
1179 nCStreamPos
= rInput
.Tell();
1180 nCStrLen
= aToken
.Len();
1181 nCLineNr
= GetLineNr();
1182 nCLinePos
= GetLinePos();
1184 bDone
= aToken
.Len() >= 2 &&
1185 aToken
.Copy(aToken
.Len()-2,2).
1186 EqualsAscii( "--" );
1193 nNextCh
= GetNextChar();
// No "-->" found: go back to the remembered position and cut the token.
1195 if( !bDone
&& IsParserWorking() && nCStreamPos
)
1197 rInput
.Seek( nCStreamPos
);
1198 SetLineNr( nCLineNr
);
1199 SetLinePos( nCLinePos
);
1200 ClearTxtConvContext();
1201 aToken
.Erase( nCStrLen
);
1207 // TokenString not needed anymore
1211 // Read until closing '>'
1212 if( '>' != nNextCh
&& IsParserWorking() )
1216 // fdo#34666 fdo#36080 fdo#36390: closing "/>"?:
1217 // generate pending HTML_<TOKEN>_OFF for HTML_<TOKEN>_ON
1218 // Do not convert this to a single HTML_<TOKEN>_OFF
1219 // which lead to fdo#56772.
1220 if ((HTML_TOKEN_ONOFF
& nRet
) && (aToken
.Len() >= 1) &&
1221 ('/' == aToken
.GetChar(aToken
.Len()-1)))
1223 mnPendingOffToken
= nRet
+ 1; // HTML_<TOKEN>_ON -> HTML_<TOKEN>_OFF
1224 aToken
.Erase( aToken
.Len()-1, 1); // remove trailing '/'
1226 if( sal_Unicode(EOF
) == nNextCh
&& rInput
.IsEof() )
1228 // Move back in front of < and restart there.
1229 // Return < as text.
1230 rInput
.Seek( nStreamPos
);
1231 SetLineNr( nLineNr
);
1232 SetLinePos( nLinePos
);
1233 ClearTxtConvContext();
1236 nRet
= HTML_TEXTTOKEN
;
1237 nNextCh
= GetNextChar();
1242 if( SVPAR_PENDING
== eState
)
1243 bReadNextChar
= bReadNextCharSave
;
1249 // simply discard everything
1251 if( sal_Unicode(EOF
) == nNextCh
&& rInput
.IsEof() )
1253 // Move back in front of < and restart there.
1254 // Return < as text.
1255 rInput
.Seek( nStreamPos
);
1256 SetLineNr( nLineNr
);
1257 SetLinePos( nLinePos
);
1258 ClearTxtConvContext();
1261 nRet
= HTML_TEXTTOKEN
;
1262 nNextCh
= GetNextChar();
1266 if( SVPAR_PENDING
== eState
)
1267 bReadNextChar
= bReadNextCharSave
;
1270 else if( '%' == nNextCh
)
1272 nRet
= HTML_UNKNOWNCONTROL_ON
;
1274 sal_uLong nCStreamPos
= rInput
.Tell();
1275 sal_uLong nCLineNr
= GetLineNr(), nCLinePos
= GetLinePos();
1278 // Read until closing %>. If not found restart at first >.
1279 while( !bDone
&& !rInput
.IsEof() && IsParserWorking() )
1281 bDone
= '>'==nNextCh
&& aToken
.Len() >= 1 &&
1282 '%' == aToken
.GetChar( aToken
.Len()-1 );
1286 nNextCh
= GetNextChar();
// No closing "%>" found: rewind and return "<%" as plain text.
1289 if( !bDone
&& IsParserWorking() )
1291 rInput
.Seek( nCStreamPos
);
1292 SetLineNr( nCLineNr
);
1293 SetLinePos( nCLinePos
);
1294 ClearTxtConvContext();
1295 aToken
.AssignAscii( "<%", 2 );
1296 nRet
= HTML_TEXTTOKEN
;
1299 if( IsParserWorking() )
1301 sSaveToken
= aToken
;
1308 nRet
= HTML_TEXTTOKEN
;
1314 if( IsParserWorking() )
1316 bNextCh
= '>' == nNextCh
;
1319 case HTML_TEXTAREA_ON
:
1320 bReadTextArea
= true;
1322 case HTML_TEXTAREA_OFF
:
1323 bReadTextArea
= false;
1325 case HTML_SCRIPT_ON
:
1326 if( !bReadTextArea
)
1329 case HTML_SCRIPT_OFF
:
1330 if( !bReadTextArea
)
1332 bReadScript
= false;
1333 // JavaScript might modify the stream,
1334 // thus the last character has to be read again.
1335 bReadNextChar
= true;
1343 case HTML_STYLE_OFF
:
1351 case sal_Unicode(EOF
):
1352 if( rInput
.IsEof() )
1354 eState
= SVPAR_ACCEPTED
;
1359 // Read normal text.
1365 // form feeds are passed upwards separately
1366 nRet
= HTML_LINEFEEDCHAR
; // !!! should be FORMFEEDCHAR
1371 if( bReadListing
|| bReadXMP
|| bReadPRE
|| bReadTextArea
)
1373 sal_Unicode c
= GetNextChar();
// True unless the two characters form a CR/LF or LF/CR pair.
1374 if( ( '\n' != nNextCh
|| '\r' != c
) &&
1375 ( '\r' != nNextCh
|| '\n' != c
) )
1380 nRet
= HTML_NEWPARA
;
1387 nRet
= HTML_TABCHAR
;
1396 // "normal" text to come
1398 bNextCh
= 0 == aToken
.Len();
1400 // the text should be processed
1401 if( !bNextCh
&& eState
== SVPAR_PENDING
)
1403 eState
= SVPAR_WORKING
;
1404 bReadNextChar
= true;
1410 if( bNextCh
&& SVPAR_WORKING
== eState
)
1412 nNextCh
= GetNextChar();
1413 if( SVPAR_PENDING
== eState
&& nRet
&& HTML_TEXTTOKEN
!= nRet
)
1415 bReadNextChar
= true;
1416 eState
= SVPAR_WORKING
;
1420 } while( !nRet
&& SVPAR_WORKING
== eState
);
1422 if( SVPAR_PENDING
== eState
)
1423 nRet
= -1; // something invalid
// Removes the escaping backslashes from aToken: a '\' that is not itself
// escaped by a preceding '\' is erased from the string.
1428 void HTMLParser::UnescapeToken()
1432 bool bEscape
= false;
1433 while( nPos
< aToken
.Len() )
1435 bool bOldEscape
= bEscape
;
1437 if( '\\'==aToken
.GetChar(nPos
) && !bOldEscape
)
1439 aToken
.Erase( nPos
, 1 );
// Splits the text of the current tag (aToken) into its options and appends
// one HTMLOption per option to maOptions.
// An option starts at a letter; its name runs up to '=', white space or a
// non-printable character. A value may follow after '=', either enclosed in
// " or ' or unquoted. The option name is upper-cased only for the lookup
// with GetHTMLOption(); the HTMLOption receives the original spelling.
// pNoConvertToken may point to an option id; it and the script options are
// the ones for which bStripCRLF is false.
// Although the method is const, aToken is modified through a cast in order
// to erase characters while the value is scanned.
1449 const HTMLOptions
& HTMLParser::GetOptions( sal_uInt16
*pNoConvertToken
) const
1451 // If the options for the current token have already been returned,
1452 // return them once again.
1453 if (!maOptions
.empty())
1456 xub_StrLen nPos
= 0;
1457 while( nPos
< aToken
.Len() )
1459 // A letter? Option beginning here.
1460 if( HTML_ISALPHA( aToken
.GetChar(nPos
) ) )
1464 xub_StrLen nStt
= nPos
;
1465 sal_Unicode cChar
= 0;
1467 // Actually only certain characters allowed.
1468 // Netscape only looks for "=" and white space (c.f.
1469 // Mozilla: PA_FetchRequestedNameValues in lipparse/pa_mdl.c)
1470 while( nPos
< aToken
.Len() && '=' != (cChar
=aToken
.GetChar(nPos
)) &&
1471 HTML_ISPRINTABLE(cChar
) && !HTML_ISSPACE(cChar
) )
1474 String
sName( aToken
.Copy( nStt
, nPos
-nStt
) );
1476 // PlugIns require original token name. Convert to upper case only for searching.
1477 String
sNameUpperCase( sName
);
1478 sNameUpperCase
.ToUpperAscii();
1480 nToken
= GetHTMLOption( sNameUpperCase
); // Name is ready
1481 DBG_ASSERTWARNING( nToken
!=HTML_O_UNKNOWN
,
1482 "GetOption: unknown HTML option" );
1483 bool bStripCRLF
= (nToken
< HTML_OPTION_SCRIPT_START
||
1484 nToken
>= HTML_OPTION_SCRIPT_END
) &&
1485 (!pNoConvertToken
|| nToken
!= *pNoConvertToken
);
// Skip white space and non-printable characters after the name.
1487 while( nPos
< aToken
.Len() &&
1488 ( !HTML_ISPRINTABLE( (cChar
=aToken
.GetChar(nPos
)) ) ||
1489 HTML_ISSPACE(cChar
) ) )
1492 // Option with value?
1493 if( nPos
!=aToken
.Len() && '='==cChar
)
// Skip white space and non-printable characters in front of the value.
1497 while( nPos
< aToken
.Len() &&
1498 ( !HTML_ISPRINTABLE( (cChar
=aToken
.GetChar(nPos
)) ) ||
1499 ' '==cChar
|| '\t'==cChar
|| '\r'==cChar
|| '\n'==cChar
) )
1502 if( nPos
!= aToken
.Len() )
1504 xub_StrLen nLen
= 0;
// Quoted value: it ends at the next unescaped occurrence of the same quote.
1506 if( ('"'==cChar
) || ('\'')==cChar
)
1508 sal_Unicode cEnd
= cChar
;
1511 bool bEscape
= false;
1512 while( nPos
< aToken
.Len() && !bDone
)
1514 bool bOldEscape
= bEscape
;
1516 cChar
= aToken
.GetChar(nPos
);
1522 ((String
&)aToken
).Erase( nPos
, 1 );
1533 ((String
&)aToken
).Erase( nPos
, 1 );
1539 bDone
= !bOldEscape
&& cChar
==cEnd
;
1548 if( nPos
!=aToken
.Len() )
1553 // More liberal than the standard: allow all printable characters
1554 bool bEscape
= false;
1556 while( nPos
< aToken
.Len() && !bDone
)
1558 bool bOldEscape
= bEscape
;
1560 sal_Unicode c
= aToken
.GetChar(nPos
);
1564 bDone
= !bOldEscape
;
1582 ((String
&)aToken
).Erase( nPos
, 1 );
1588 if( HTML_ISPRINTABLE( c
) )
1598 aValue
= aToken
.Copy( nStt
, nLen
);
1602 // Token is known and can be saved
1603 std::auto_ptr
<HTMLOption
> pOption(
1604 new HTMLOption(sal::static_int_cast
<sal_uInt16
>(nToken
), sName
, aValue
));
1606 maOptions
.push_back(pOption
);
1609 // Ignore white space and unexpected characters
// Filters a token while preformatted text (<PRE>) is being read.
// Visible behaviour:
//  - paragraph breaks and line breaks are mapped depending on
//    HTML_BEHAVIOUR (to HTML_NEWPARA, or paragraph break to HTML_LINEBREAK);
//  - a tab is replaced by a text token of blanks that pads to the next
//    multiple of 8 columns, with nPre_LinePos tracking the column;
//  - a text token advances nPre_LinePos by its length;
//  - tokens without a case label of their own are turned into
//    HTML_UNKNOWNCONTROL_OFF (on/off token with bit 0 set) or
//    HTML_UNKNOWNCONTROL_ON;
//  - bPre_IgnoreNewPara is reset at the end.
// NOTE(review): the long list of case labels presumably lets those tokens
// pass unchanged - confirm against the statements that follow them.
1616 int HTMLParser::FilterPRE( int nToken
)
1620 #ifdef HTML_BEHAVIOUR
1621 // These become LFs according to the definition
1622 case HTML_PARABREAK_ON
:
1623 case HTML_LINEBREAK
:
1624 nToken
= HTML_NEWPARA
;
1626 // in Netscape they only have impact in not empty paragraphs
1627 case HTML_PARABREAK_ON
:
1628 nToken
= HTML_LINEBREAK
;
1629 case HTML_LINEBREAK
:
1633 if( bPre_IgnoreNewPara
)
// Number of blanks needed to reach the next tab stop (every 8 columns).
1639 sal_Int32 nSpaces
= (8 - (nPre_LinePos
% 8));
1640 DBG_ASSERT( !aToken
.Len(), "Why is the token not empty?" );
1641 if (aToken
.Len() < nSpaces
)
1643 using comphelper::string::padToLength
;
1644 OUStringBuffer
aBuf(aToken
);
1645 aToken
= padToLength(aBuf
, nSpaces
, ' ').makeStringAndClear();
1647 nPre_LinePos
+= nSpaces
;
1648 nToken
= HTML_TEXTTOKEN
;
1652 case HTML_TEXTTOKEN
:
1653 nPre_LinePos
+= aToken
.Len();
1656 case HTML_SELECT_ON
:
1657 case HTML_SELECT_OFF
:
1663 case HTML_TEXTAREA_ON
:
1664 case HTML_TEXTAREA_OFF
:
1667 case HTML_APPLET_ON
:
1668 case HTML_APPLET_OFF
:
1673 case HTML_HEAD1_OFF
:
1675 case HTML_HEAD2_OFF
:
1677 case HTML_HEAD3_OFF
:
1679 case HTML_HEAD4_OFF
:
1681 case HTML_HEAD5_OFF
:
1683 case HTML_HEAD6_OFF
:
1684 case HTML_BLOCKQUOTE_ON
:
1685 case HTML_BLOCKQUOTE_OFF
:
1686 case HTML_ADDRESS_ON
:
1687 case HTML_ADDRESS_OFF
:
1690 case HTML_CENTER_ON
:
1691 case HTML_CENTER_OFF
:
1692 case HTML_DIVISION_ON
:
1693 case HTML_DIVISION_OFF
:
1695 case HTML_SCRIPT_ON
:
1696 case HTML_SCRIPT_OFF
:
1700 case HTML_TABLE_OFF
:
1701 case HTML_CAPTION_ON
:
1702 case HTML_CAPTION_OFF
:
1703 case HTML_COLGROUP_ON
:
1704 case HTML_COLGROUP_OFF
:
1708 case HTML_THEAD_OFF
:
1710 case HTML_TFOOT_OFF
:
1712 case HTML_TBODY_OFF
:
1713 case HTML_TABLEROW_ON
:
1714 case HTML_TABLEROW_OFF
:
1715 case HTML_TABLEDATA_ON
:
1716 case HTML_TABLEDATA_OFF
:
1717 case HTML_TABLEHEADER_ON
:
1718 case HTML_TABLEHEADER_OFF
:
1720 case HTML_ANCHOR_ON
:
1721 case HTML_ANCHOR_OFF
:
1724 case HTML_ITALIC_ON
:
1725 case HTML_ITALIC_OFF
:
1726 case HTML_STRIKE_ON
:
1727 case HTML_STRIKE_OFF
:
1728 case HTML_STRIKETHROUGH_ON
:
1729 case HTML_STRIKETHROUGH_OFF
:
1730 case HTML_UNDERLINE_ON
:
1731 case HTML_UNDERLINE_OFF
:
1732 case HTML_BASEFONT_ON
:
1733 case HTML_BASEFONT_OFF
:
1737 case HTML_BLINK_OFF
:
1740 case HTML_SUBSCRIPT_ON
:
1741 case HTML_SUBSCRIPT_OFF
:
1742 case HTML_SUPERSCRIPT_ON
:
1743 case HTML_SUPERSCRIPT_OFF
:
1744 case HTML_BIGPRINT_ON
:
1745 case HTML_BIGPRINT_OFF
:
1746 case HTML_SMALLPRINT_OFF
:
1747 case HTML_SMALLPRINT_ON
:
1749 case HTML_EMPHASIS_ON
:
1750 case HTML_EMPHASIS_OFF
:
1751 case HTML_CITIATION_ON
:
1752 case HTML_CITIATION_OFF
:
1753 case HTML_STRONG_ON
:
1754 case HTML_STRONG_OFF
:
1757 case HTML_SAMPLE_ON
:
1758 case HTML_SAMPLE_OFF
:
1759 case HTML_KEYBOARD_ON
:
1760 case HTML_KEYBOARD_OFF
:
1761 case HTML_VARIABLE_ON
:
1762 case HTML_VARIABLE_OFF
:
1763 case HTML_DEFINSTANCE_ON
:
1764 case HTML_DEFINSTANCE_OFF
:
1765 case HTML_SHORTQUOTE_ON
:
1766 case HTML_SHORTQUOTE_OFF
:
1767 case HTML_LANGUAGE_ON
:
1768 case HTML_LANGUAGE_OFF
:
1769 case HTML_AUTHOR_ON
:
1770 case HTML_AUTHOR_OFF
:
1771 case HTML_PERSON_ON
:
1772 case HTML_PERSON_OFF
:
1773 case HTML_ACRONYM_ON
:
1774 case HTML_ACRONYM_OFF
:
1775 case HTML_ABBREVIATION_ON
:
1776 case HTML_ABBREVIATION_OFF
:
1777 case HTML_INSERTEDTEXT_ON
:
1778 case HTML_INSERTEDTEXT_OFF
:
1779 case HTML_DELETEDTEXT_ON
:
1780 case HTML_DELETEDTEXT_OFF
:
1781 case HTML_TELETYPE_ON
:
1782 case HTML_TELETYPE_OFF
:
1786 // The remainder is treated as an unknown token.
// Bit 0 set on an on/off token selects the OFF variant.
1791 ( ((HTML_TOKEN_ONOFF
& nToken
) && (1 & nToken
))
1792 ? HTML_UNKNOWNCONTROL_OFF
1793 : HTML_UNKNOWNCONTROL_ON
);
1798 bPre_IgnoreNewPara
= false;
// Filters a single token id for the parser.
// NOTE(review): judging by the name this is used for the content of an <XMP>
// element - confirm with the caller.
// The code below rebuilds markup as literal text from the members sSaveToken
// and aToken and then sets nToken to HTML_TEXTTOKEN. It also reads and
// clears the member bPre_IgnoreNewPara.
1803 int HTMLParser::FilterXMP( int nToken
)
1808 if( bPre_IgnoreNewPara
)
1810 case HTML_TEXTTOKEN
:
1811 case HTML_NONBREAKSPACE
:
// A token that has an HTML_TOKEN_ONOFF bit set and whose lowest bit is set
// gets "</" put in front of the saved token text; the other branch visible
// below puts only "<" in front.
1818 if( (HTML_TOKEN_ONOFF
& nToken
) && (1 & nToken
) )
1820 sSaveToken
.Insert( '<', 0 );
1821 sSaveToken
.Insert( '/', 1 );
1824 sSaveToken
.Insert( '<', 0 );
// Either the saved token text plus a blank is placed in front of the
// existing aToken, or aToken is replaced by the saved token text.
// NOTE(review): the condition choosing between the two is not shown on
// these lines - confirm.
1828 sSaveToken
+= (sal_Unicode
)' ';
1829 aToken
.Insert( sSaveToken
, 0 );
1832 aToken
= sSaveToken
;
// Close the rebuilt tag text and hand it on as ordinary text.
1833 aToken
+= (sal_Unicode
)'>';
1834 nToken
= HTML_TEXTTOKEN
;
1839 bPre_IgnoreNewPara
= false;
// Filters a single token id for the parser.
// NOTE(review): judging by the name this is used for the content of a
// <LISTING> element - confirm with the caller.
// Reads and clears the member bPre_IgnoreNewPara.
1844 int HTMLParser::FilterListing( int nToken
)
1849 if( bPre_IgnoreNewPara
)
1851 case HTML_TEXTTOKEN
:
1852 case HTML_NONBREAKSPACE
:
// A token that has an HTML_TOKEN_ONOFF bit set and whose lowest bit is set
// selects HTML_UNKNOWNCONTROL_OFF; every other token selects
// HTML_UNKNOWNCONTROL_ON.
1860 ( ((HTML_TOKEN_ONOFF
& nToken
) && (1 & nToken
))
1861 ? HTML_UNKNOWNCONTROL_OFF
1862 : HTML_UNKNOWNCONTROL_ON
);
1867 bPre_IgnoreNewPara
= false;
// Heuristic check whether the buffer pHeader looks like the beginning of an
// HTML document.
//
// pHeader  start of the document as raw bytes. When the data is handled as
//          UCS2 the length scan below runs until it meets two consecutive
//          zero bytes, so the buffer must be terminated that way.
// eEnc     text encoding of pHeader; switched to RTL_TEXTENCODING_UCS2 by
//          the byte-order-mark checks below.
1872 bool HTMLParser::IsHTMLFormat( const sal_Char
* pHeader
,
1874 rtl_TextEncoding eEnc
)
1876 // If the string matches one of the following regular expressions then
1877 // the document is an HTML document.
1879 // ^[^<]*<[^ \t]*[> \t]
1883 // where the underlined subexpression has to be an HTML token
1885 bool bUCS2B
= false;
// Byte-order-mark detection: both byte orders (0xfe 0xff and 0xff 0xfe)
// select UCS2.
1888 if( 0xfeU
== (sal_uChar
)pHeader
[0] &&
1889 0xffU
== (sal_uChar
)pHeader
[1] )
1891 eEnc
= RTL_TEXTENCODING_UCS2
;
1894 else if( 0xffU
== (sal_uChar
)pHeader
[0] &&
1895 0xfeU
== (sal_uChar
)pHeader
[1] )
1897 eEnc
= RTL_TEXTENCODING_UCS2
;
// UCS2 data that really starts with a byte-order mark is narrowed to an
// 8-bit string first, so the comparisons further down can work on bytes.
1902 RTL_TEXTENCODING_UCS2
== eEnc
&&
1904 (0xfe == (sal_uChar
)pHeader
[0] && 0xff == (sal_uChar
)pHeader
[1]) ||
1905 (0xff == (sal_uChar
)pHeader
[0] && 0xfe == (sal_uChar
)pHeader
[1])
1909 if( 0xfe == (sal_uChar
)pHeader
[0] )
// Loop condition of the length scan: continue until a pair of zero bytes.
1914 pHeader
[nLen
] != 0 || pHeader
[nLen
+1] != 0;
// The two bytes of the byte-order mark are skipped (start at offset 2), and
// every UCS2 code unit yields one output character.
1918 OStringBuffer
sTmp( (nLen
- 2)/2 );
1919 for( xub_StrLen nPos
= 2; nPos
< nLen
; nPos
+= 2 )
// Assemble one code unit from two bytes: the first form takes the high
// byte first, the second form takes the low byte first.
1923 cUC
= (sal_Unicode(pHeader
[nPos
]) << 8) | pHeader
[nPos
+1];
1925 cUC
= (sal_Unicode(pHeader
[nPos
+1]) << 8) | pHeader
[nPos
];
// Code units of 256 and above are replaced by '.'.
1929 sTmp
.append( cUC
< 256U ? (sal_Char
)cUC
: '.' );
1931 sCmp
= sTmp
.makeStringAndClear();
// All following comparisons are done on the upper-cased text.
1938 sCmp
= sCmp
.toAsciiUpperCase();
1940 // An HTML document must have a '<' in the first line
1941 sal_Int32 nStart
= sCmp
.indexOf('<');
1946 // followed by arbitrary characters followed by a blank or '>'
1949 for( nPos
= nStart
; nPos
< sCmp
.getLength(); ++nPos
)
1951 if( '>'==(c
=sCmp
[nPos
]) || HTML_ISSPACE(c
) )
1955 // If the document ends after < it's no HTML
1959 // the string following '<' has to be a known HTML token.
1960 // <DIR> is not interpreted as HTML. Otherwise the output of the DOS command "DIR"
1961 // could be interpreted as HTML.
1962 OUString
sTest(OStringToOUString(sCmp
.copy(nStart
, nPos
-nStart
), RTL_TEXTENCODING_ASCII_US
));
1963 int nTok
= GetHTMLToken( sTest
);
1964 if( 0 != nTok
&& HTML_DIRLIST_ON
!= nTok
)
1967 // "<!" at the very beginning of the file?
1968 if( nStart
== 1 && '!' == sCmp
[1] )
1971 // <HTML> somewhere in the first 80 characters of the document
// The match must be directly preceded by '<' and directly followed by '>'.
1972 nStart
= sCmp
.indexOfL(RTL_CONSTASCII_STRINGPARAM(OOO_STRING_SVTOOLS_HTML_html
));
1973 if( nStart
>0 && '<'==sCmp
[nStart
-1] &&
1974 nStart
+4 < sCmp
.getLength() && '>'==sCmp
[nStart
+4] )
1977 // Otherwise it is most likely not an HTML document
// Checks rURL against the known internal image names (the
// OOO_STRING_SVTOOLS_HTML_internal_gopher and
// OOO_STRING_SVTOOLS_HTML_internal_icon families) and rewrites rURL in place
// to OOO_STRING_SVTOOLS_HTML_private_image followed by its previous value.
// NOTE(review): the rewrite presumably only happens when a known name was
// matched (bFound) - confirm.
1981 bool HTMLParser::InternalImgToPrivateURL( String
& rURL
)
// Quick reject: shorter than 19 characters, not starting with 'i', or not
// sharing its first 9 characters with the internal-gopher prefix.
1983 if( rURL
.Len() < 19 || 'i' != rURL
.GetChar(0) ||
1984 rURL
.CompareToAscii( OOO_STRING_SVTOOLS_HTML_internal_gopher
, 9 ) != COMPARE_EQUAL
)
1987 bool bFound
= false;
// Gopher family: the first 16 characters match the prefix, the remainder is
// the image name, which is dispatched on its first character.
1989 if( rURL
.CompareToAscii( OOO_STRING_SVTOOLS_HTML_internal_gopher
,16) == COMPARE_EQUAL
)
1991 String
aName( rURL
.Copy(16) );
1992 switch( aName
.GetChar(0) )
1995 bFound
= aName
.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_GOPHER_binary
);
1998 bFound
= aName
.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_GOPHER_image
) ||
1999 aName
.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_GOPHER_index
);
2002 bFound
= aName
.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_GOPHER_menu
) ||
2003 aName
.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_GOPHER_movie
);
2006 bFound
= aName
.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_GOPHER_sound
);
2009 bFound
= aName
.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_GOPHER_telnet
) ||
2010 aName
.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_GOPHER_text
);
2013 bFound
= aName
.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_GOPHER_unknown
);
// Icon family: the first 14 characters match the prefix, the remainder is
// the image name, which is dispatched on its first character.
2017 else if( rURL
.CompareToAscii( OOO_STRING_SVTOOLS_HTML_internal_icon
,14) == COMPARE_EQUAL
)
2019 String
aName( rURL
.Copy(14) );
2020 switch( aName
.GetChar(0) )
2023 bFound
= aName
.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_ICON_baddata
);
2026 bFound
= aName
.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_ICON_delayed
);
2029 bFound
= aName
.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_ICON_embed
);
2032 bFound
= aName
.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_ICON_insecure
);
2035 bFound
= aName
.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_ICON_notfound
);
// Prefix the previous value of rURL with the private-image scheme string.
2041 String
sTmp ( rURL
);
2042 rURL
.AssignAscii( OOO_STRING_SVTOOLS_HTML_private_image
);
2043 rURL
.Append( sTmp
);
// Action codes for META entries. They form the value column of
// aHTMLMetaNameTable and are the values ParseMetaOptionsImpl dispatches on.
2052 HTML_META_DESCRIPTION
,
2055 HTML_META_CLASSIFICATION
,
2057 HTML_META_CHANGEDBY
,
2059 HTML_META_GENERATOR
,
2060 HTML_META_SDFOOTNOTE
,
2061 HTML_META_SDENDNOTE
,
2062 HTML_META_CONTENT_TYPE
// Maps the name strings of META entries to their HTML_META_* action codes.
// ParseMetaOptionsImpl passes this table to HTMLOption::GetEnum.
2066 static HTMLOptionEnum
const aHTMLMetaNameTable
[] =
2068 { OOO_STRING_SVTOOLS_HTML_META_author
, HTML_META_AUTHOR
},
2069 { OOO_STRING_SVTOOLS_HTML_META_changed
, HTML_META_CHANGED
},
2070 { OOO_STRING_SVTOOLS_HTML_META_changedby
, HTML_META_CHANGEDBY
},
2071 { OOO_STRING_SVTOOLS_HTML_META_classification
,HTML_META_CLASSIFICATION
},
2072 { OOO_STRING_SVTOOLS_HTML_META_content_type
, HTML_META_CONTENT_TYPE
},
2073 { OOO_STRING_SVTOOLS_HTML_META_created
, HTML_META_CREATED
},
2074 { OOO_STRING_SVTOOLS_HTML_META_description
, HTML_META_DESCRIPTION
},
2075 { OOO_STRING_SVTOOLS_HTML_META_keywords
, HTML_META_KEYWORDS
},
2076 { OOO_STRING_SVTOOLS_HTML_META_generator
, HTML_META_GENERATOR
},
2077 { OOO_STRING_SVTOOLS_HTML_META_refresh
, HTML_META_REFRESH
},
2078 { OOO_STRING_SVTOOLS_HTML_META_sdendnote
, HTML_META_SDENDNOTE
},
2079 { OOO_STRING_SVTOOLS_HTML_META_sdfootnote
, HTML_META_SDFOOTNOTE
},
// Called by ParseMetaOptionsImpl with the name of a META entry right after
// that entry has been added to the user-defined document properties.
// The parameter is unnamed in this definition.
2084 void HTMLParser::AddMetaUserDefined( OUString
const & )
// Evaluates the options of one META tag and stores the result.
//
// i_xDocProps    document properties that receive author, description,
//                keywords, subject, modified-by, creation/modification date
//                and user-defined entries; every use is guarded by is().
// i_pHTTPHeader  receives the key/value pair of an HTTP-EQUIV entry; only
//                used when non-null.
// aOptions       the options of the META tag.
// o_rEnc         set from GetEncodingByMIME for a content-type entry with
//                non-empty content.
2088 bool HTMLParser::ParseMetaOptionsImpl(
2089 const uno::Reference
<document::XDocumentProperties
> & i_xDocProps
,
2090 SvKeyValueIterator
*i_pHTTPHeader
,
2091 const HTMLOptions
& aOptions
,
2092 rtl_TextEncoding
& o_rEnc
)
2094 String aName
, aContent
;
2095 sal_uInt16 nAction
= HTML_META_NONE
;
2096 bool bHTTPEquiv
= false, bChanged
= false;
// The options are visited from the last one to the first one.
2098 for ( size_t i
= aOptions
.size(); i
; )
2100 const HTMLOption
& aOption
= aOptions
[--i
];
2101 switch ( aOption
.GetToken() )
// Here the action is only looked up while none has been determined yet.
2104 aName
= aOption
.GetString();
2105 if ( HTML_META_NONE
==nAction
)
2107 aOption
.GetEnum( nAction
, aHTMLMetaNameTable
);
// For HTTP-EQUIV the action is looked up unconditionally.
2110 case HTML_O_HTTPEQUIV
:
2111 aName
= aOption
.GetString();
2112 aOption
.GetEnum( nAction
, aHTMLMetaNameTable
);
2115 case HTML_O_CONTENT
:
2116 aContent
= aOption
.GetString();
2121 if ( bHTTPEquiv
|| HTML_META_DESCRIPTION
!= nAction
)
2123 // if it is not a Description, remove CRs and LFs from CONTENT
2124 aContent
= comphelper::string::remove(aContent
, '\r');
2125 aContent
= comphelper::string::remove(aContent
, '\n');
2129 // convert line endings for Description
2130 aContent
= convertLineEnd(aContent
, GetSystemLineEnd());
2134 if ( bHTTPEquiv
&& i_pHTTPHeader
)
2136 // Netscape seems to just ignore a closing ", so we do too
2137 if ( aContent
.Len() && '"' == aContent
.GetChar( aContent
.Len()-1 ) )
2139 aContent
.Erase( aContent
.Len() - 1 );
2141 SvKeyValue
aKeyValue( aName
, aContent
);
2142 i_pHTTPHeader
->Append( aKeyValue
);
2147 case HTML_META_AUTHOR
:
2148 if (i_xDocProps
.is()) {
2149 i_xDocProps
->setAuthor( aContent
);
2153 case HTML_META_DESCRIPTION
:
2154 if (i_xDocProps
.is()) {
2155 i_xDocProps
->setDescription( aContent
);
2159 case HTML_META_KEYWORDS
:
2160 if (i_xDocProps
.is()) {
2161 i_xDocProps
->setKeywords(
2162 ::comphelper::string::convertCommaSeparated(aContent
));
// The classification is stored as the subject of the document.
2166 case HTML_META_CLASSIFICATION
:
2167 if (i_xDocProps
.is()) {
2168 i_xDocProps
->setSubject( aContent
);
2173 case HTML_META_CHANGEDBY
:
2174 if (i_xDocProps
.is()) {
2175 i_xDocProps
->setModifiedBy( aContent
);
// Dates are only accepted when CONTENT consists of exactly two
// ';'-separated tokens: token 0 is converted with ToInt32 and passed to
// Date, token 1 likewise to Time.
2179 case HTML_META_CREATED
:
2180 case HTML_META_CHANGED
:
2181 if ( i_xDocProps
.is() && aContent
.Len() &&
2182 comphelper::string::getTokenCount(aContent
, ';') == 2 )
2184 Date
aDate( (sal_uLong
)aContent
.GetToken(0).ToInt32() );
2185 Time
aTime( (sal_uLong
)aContent
.GetToken(1).ToInt32() );
2186 DateTime
aDateTime( aDate
, aTime
);
2187 ::util::DateTime
uDT(aDateTime
.GetNanoSec(),
2188 aDateTime
.GetSec(), aDateTime
.GetMin(),
2189 aDateTime
.GetHour(), aDateTime
.GetDay(),
2190 aDateTime
.GetMonth(), aDateTime
.GetYear(),
2192 if ( HTML_META_CREATED
==nAction
)
2193 i_xDocProps
->setCreationDate( uDT
);
2195 i_xDocProps
->setModificationDate( uDT
);
// The assertion text below is German; it says roughly "reload URL lost
// because a mandatory change was not made". It fires when an HTTP-EQUIV
// entry arrives without an HTTP header object to store it in.
2200 case HTML_META_REFRESH
:
2201 DBG_ASSERT( !bHTTPEquiv
|| i_pHTTPHeader
,
2202 "Reload-URL aufgrund unterlassener MUSS-Aenderung verlorengegangen" );
2205 case HTML_META_CONTENT_TYPE
:
2206 if ( aContent
.Len() )
2208 o_rEnc
= GetEncodingByMIME( aContent
);
// No known action: the entry is stored as a removable user-defined document
// property and reported through AddMetaUserDefined. A uno::Exception raised
// on the way is caught.
2212 case HTML_META_NONE
:
2215 if (i_xDocProps
.is())
2217 uno::Reference
<beans::XPropertyContainer
> xUDProps
2218 = i_xDocProps
->getUserDefinedProperties();
2220 xUDProps
->addProperty(aName
,
2221 beans::PropertyAttribute::REMOVABLE
,
2222 uno::makeAny(OUString(aContent
)));
2223 AddMetaUserDefined(aName
);
2225 } catch (uno::Exception
&) {
// Evaluates the options of the current META tag through ParseMetaOptionsImpl
// and, under the conditions checked below, switches the source encoding of
// the parser to the encoding announced by the tag.
//
// i_xDocProps  document properties, passed on to ParseMetaOptionsImpl.
// i_pHeader    HTTP header container, passed on to ParseMetaOptionsImpl.
2238 bool HTMLParser::ParseMetaOptions(
2239 const uno::Reference
<document::XDocumentProperties
> & i_xDocProps
,
2240 SvKeyValueIterator
*i_pHeader
)
2242 sal_uInt16 nContentOption
= HTML_O_CONTENT
;
2243 rtl_TextEncoding eEnc
= RTL_TEXTENCODING_DONTKNOW
;
2245 bool bRet
= ParseMetaOptionsImpl( i_xDocProps
, i_pHeader
,
2246 GetOptions(&nContentOption
),
2249 // If the encoding is set by a META tag, it may only overwrite the
2250 // current encoding if both the current and the new encoding are
2251 // single-byte (octet) encodings. Everything else cannot lead to
// reasonable results.
2252 if (RTL_TEXTENCODING_DONTKNOW
!= eEnc
&&
2253 rtl_isOctetTextEncoding( eEnc
) &&
2254 rtl_isOctetTextEncoding( GetSrcEncoding() ) )
2256 eEnc
= GetExtendedCompatibilityTextEncoding( eEnc
);
2257 SetSrcEncoding( eEnc
);
// Determines the text encoding named by the "charset" parameter of the MIME
// content type rMime.
// The charset value is converted to an ASCII string, looked up with
// rtl_getTextEncodingFromMimeCharset and passed through
// GetExtendedCompatibilityTextEncoding. RTL_TEXTENCODING_DONTKNOW is the
// fallback result.
2263 rtl_TextEncoding
HTMLParser::GetEncodingByMIME( const String
& rMime
)
2267 INetContentTypeParameterList aParameters
;
2268 if (INetContentTypes::parse(rMime
, sType
, sSubType
, &aParameters
))
2270 const INetContentTypeParameter
* pCharset
= aParameters
.find("charset");
2273 OString
sValue(OUStringToOString(pCharset
->m_sValue
, RTL_TEXTENCODING_ASCII_US
));
2274 return GetExtendedCompatibilityTextEncoding( rtl_getTextEncodingFromMimeCharset( sValue
.getStr() ) );
2277 return RTL_TEXTENCODING_DONTKNOW
;
// Searches the key/value pairs of pHTTPHeader for a content-type entry (the
// key is compared case-insensitively) and derives the text encoding from its
// non-empty value with GetEncodingByMIME.
// The result starts out as RTL_TEXTENCODING_DONTKNOW.
2280 rtl_TextEncoding
HTMLParser::GetEncodingByHttpHeader( SvKeyValueIterator
*pHTTPHeader
)
2282 rtl_TextEncoding eRet
= RTL_TEXTENCODING_DONTKNOW
;
2286 for( bool bCont
= pHTTPHeader
->GetFirst( aKV
); bCont
;
2287 bCont
= pHTTPHeader
->GetNext( aKV
) )
2289 if( aKV
.GetKey().EqualsIgnoreCaseAscii( OOO_STRING_SVTOOLS_HTML_META_content_type
) )
2291 if( aKV
.GetValue().Len() )
2293 eRet
= HTMLParser::GetEncodingByMIME( aKV
.GetValue() );
// Makes the encoding announced in pHTTPHeader the source encoding of the
// parser. Nothing is set when GetEncodingByHttpHeader yields
// RTL_TEXTENCODING_DONTKNOW.
2301 bool HTMLParser::SetEncodingByHTTPHeader( SvKeyValueIterator
*pHTTPHeader
)
2304 rtl_TextEncoding eEnc
= HTMLParser::GetEncodingByHttpHeader( pHTTPHeader
);
2305 if(RTL_TEXTENCODING_DONTKNOW
!= eEnc
)
2307 SetSrcEncoding( eEnc
);
2314 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */