fdo#74697 Add Bluez 5 support for impress remote.
[LibreOffice.git] / svtools / source / svhtml / parhtml.cxx
blob90809ecec9b389df5167f3daadf40250c401f5be
1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 * This file incorporates work covered by the following license notice:
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
21 #include <ctype.h>
22 #include <stdio.h>
23 #include <comphelper/string.hxx>
24 #include <tools/stream.hxx>
25 #include <tools/debug.hxx>
26 #include <tools/color.hxx>
27 #include <rtl/ustrbuf.hxx>
28 #include <rtl/strbuf.hxx>
30 #include <tools/tenccvt.hxx>
31 #include <tools/datetime.hxx>
32 #include <svl/inettype.hxx>
33 #include <com/sun/star/beans/PropertyAttribute.hpp>
34 #include <com/sun/star/document/XDocumentProperties.hpp>
36 #include <svtools/parhtml.hxx>
37 #include <svtools/htmltokn.h>
38 #include <svtools/htmlkywd.hxx>
40 #include <memory>
42 using namespace ::com::sun::star;
45 const sal_Int32 MAX_LEN( 1024L );
47 const sal_Int32 MAX_ENTITY_LEN( 8L );
50 // Tables to convert option values into strings
52 // <INPUT TYPE=xxx>
53 static HTMLOptionEnum const aInputTypeOptEnums[] =
55 { OOO_STRING_SVTOOLS_HTML_IT_text, HTML_IT_TEXT },
56 { OOO_STRING_SVTOOLS_HTML_IT_password, HTML_IT_PASSWORD },
57 { OOO_STRING_SVTOOLS_HTML_IT_checkbox, HTML_IT_CHECKBOX },
58 { OOO_STRING_SVTOOLS_HTML_IT_radio, HTML_IT_RADIO },
59 { OOO_STRING_SVTOOLS_HTML_IT_range, HTML_IT_RANGE },
60 { OOO_STRING_SVTOOLS_HTML_IT_scribble, HTML_IT_SCRIBBLE },
61 { OOO_STRING_SVTOOLS_HTML_IT_file, HTML_IT_FILE },
62 { OOO_STRING_SVTOOLS_HTML_IT_hidden, HTML_IT_HIDDEN },
63 { OOO_STRING_SVTOOLS_HTML_IT_submit, HTML_IT_SUBMIT },
64 { OOO_STRING_SVTOOLS_HTML_IT_image, HTML_IT_IMAGE },
65 { OOO_STRING_SVTOOLS_HTML_IT_reset, HTML_IT_RESET },
66 { OOO_STRING_SVTOOLS_HTML_IT_button, HTML_IT_BUTTON },
67 { 0, 0 }
70 // <TABLE FRAME=xxx>
71 static HTMLOptionEnum const aTableFrameOptEnums[] =
73 { OOO_STRING_SVTOOLS_HTML_TF_void, HTML_TF_VOID },
74 { OOO_STRING_SVTOOLS_HTML_TF_above, HTML_TF_ABOVE },
75 { OOO_STRING_SVTOOLS_HTML_TF_below, HTML_TF_BELOW },
76 { OOO_STRING_SVTOOLS_HTML_TF_hsides, HTML_TF_HSIDES },
77 { OOO_STRING_SVTOOLS_HTML_TF_lhs, HTML_TF_LHS },
78 { OOO_STRING_SVTOOLS_HTML_TF_rhs, HTML_TF_RHS },
79 { OOO_STRING_SVTOOLS_HTML_TF_vsides, HTML_TF_VSIDES },
80 { OOO_STRING_SVTOOLS_HTML_TF_box, HTML_TF_BOX },
81 { OOO_STRING_SVTOOLS_HTML_TF_border, HTML_TF_BOX },
82 { 0, 0 }
85 // <TABLE RULES=xxx>
86 static HTMLOptionEnum const aTableRulesOptEnums[] =
88 { OOO_STRING_SVTOOLS_HTML_TR_none, HTML_TR_NONE },
89 { OOO_STRING_SVTOOLS_HTML_TR_groups, HTML_TR_GROUPS },
90 { OOO_STRING_SVTOOLS_HTML_TR_rows, HTML_TR_ROWS },
91 { OOO_STRING_SVTOOLS_HTML_TR_cols, HTML_TR_COLS },
92 { OOO_STRING_SVTOOLS_HTML_TR_all, HTML_TR_ALL },
93 { 0, 0 }
96 sal_uInt16 HTMLOption::GetEnum( const HTMLOptionEnum *pOptEnums, sal_uInt16 nDflt ) const
98 sal_uInt16 nValue = nDflt;
100 while( pOptEnums->pName )
101 if( aValue.EqualsIgnoreCaseAscii( pOptEnums->pName ) )
102 break;
103 else
104 pOptEnums++;
106 if( pOptEnums->pName )
107 nValue = pOptEnums->nValue;
109 return nValue;
112 bool HTMLOption::GetEnum( sal_uInt16 &rEnum, const HTMLOptionEnum *pOptEnums ) const
114 while( pOptEnums->pName )
116 if( aValue.EqualsIgnoreCaseAscii( pOptEnums->pName ) )
117 break;
118 else
119 pOptEnums++;
122 const sal_Char *pName = pOptEnums->pName;
123 if( pName )
124 rEnum = pOptEnums->nValue;
126 return (pName != 0);
129 HTMLOption::HTMLOption( sal_uInt16 nTok, const String& rToken,
130 const String& rValue )
131 : aValue(rValue)
132 , aToken(rToken)
133 , nToken( nTok )
135 DBG_ASSERT( nToken>=HTML_OPTION_START && nToken<HTML_OPTION_END,
136 "HTMLOption: unknown token" );
139 sal_uInt32 HTMLOption::GetNumber() const
141 DBG_ASSERT( (nToken>=HTML_OPTION_NUMBER_START &&
142 nToken<HTML_OPTION_NUMBER_END) ||
143 (nToken>=HTML_OPTION_CONTEXT_START &&
144 nToken<HTML_OPTION_CONTEXT_END) ||
145 nToken==HTML_O_VALUE,
146 "GetNumber: Option not numerical" );
147 String aTmp(comphelper::string::stripStart(aValue, ' '));
148 sal_Int32 nTmp = aTmp.ToInt32();
149 return nTmp >= 0 ? (sal_uInt32)nTmp : 0;
152 sal_Int32 HTMLOption::GetSNumber() const
154 DBG_ASSERT( (nToken>=HTML_OPTION_NUMBER_START && nToken<HTML_OPTION_NUMBER_END) ||
155 (nToken>=HTML_OPTION_CONTEXT_START && nToken<HTML_OPTION_CONTEXT_END),
156 "GetSNumber: Option not numerical" );
157 String aTmp(comphelper::string::stripStart(aValue, ' '));
158 return aTmp.ToInt32();
161 void HTMLOption::GetNumbers( std::vector<sal_uInt32> &rNumbers, bool bSpaceDelim ) const
163 rNumbers.clear();
165 if( bSpaceDelim )
167 // This is a very simplified scanner: it only searches all
168 // numerals in the string.
169 bool bInNum = false;
170 sal_uLong nNum = 0;
171 for( xub_StrLen i=0; i<aValue.Len(); i++ )
173 register sal_Unicode c = aValue.GetChar( i );
174 if( c>='0' && c<='9' )
176 nNum *= 10;
177 nNum += (c - '0');
178 bInNum = true;
180 else if( bInNum )
182 rNumbers.push_back( nNum );
183 bInNum = false;
184 nNum = 0;
187 if( bInNum )
189 rNumbers.push_back( nNum );
192 else
194 // Check whether numbers are separated by ',' and
195 // insert 0 if necessary
196 xub_StrLen nPos = 0;
197 while( nPos < aValue.Len() )
199 register sal_Unicode c;
200 while( nPos < aValue.Len() &&
201 ((c=aValue.GetChar(nPos)) == ' ' || c == '\t' ||
202 c == '\n' || c== '\r' ) )
203 nPos++;
205 if( nPos==aValue.Len() )
206 rNumbers.push_back(0);
207 else
209 xub_StrLen nEnd = aValue.Search( (sal_Unicode)',', nPos );
210 if( STRING_NOTFOUND==nEnd )
212 sal_Int32 nTmp = aValue.Copy(nPos).ToInt32();
213 rNumbers.push_back( nTmp >= 0 ? (sal_uInt32)nTmp : 0 );
214 nPos = aValue.Len();
216 else
218 sal_Int32 nTmp =
219 aValue.Copy(nPos,nEnd-nPos).ToInt32();
220 rNumbers.push_back( nTmp >= 0 ? (sal_uInt32)nTmp : 0 );
221 nPos = nEnd+1;
228 void HTMLOption::GetColor( Color& rColor ) const
230 DBG_ASSERT( (nToken>=HTML_OPTION_COLOR_START && nToken<HTML_OPTION_COLOR_END) || nToken==HTML_O_SIZE,
231 "GetColor: Option is not a color." );
233 String aTmp( aValue );
234 aTmp.ToUpperAscii();
235 sal_uInt32 nColor = SAL_MAX_UINT32;
236 if( '#'!=aTmp.GetChar( 0 ) )
237 nColor = GetHTMLColor( aTmp );
239 if( SAL_MAX_UINT32 == nColor )
241 nColor = 0;
242 xub_StrLen nPos = 0;
243 for( sal_uInt32 i=0; i<6; i++ )
245 // Whatever Netscape does to get color values,
246 // at maximum three characters < '0' are ignored.
247 register sal_Unicode c = nPos<aTmp.Len() ? aTmp.GetChar( nPos++ )
248 : '0';
249 if( c < '0' )
251 c = nPos<aTmp.Len() ? aTmp.GetChar(nPos++) : '0';
252 if( c < '0' )
253 c = nPos<aTmp.Len() ? aTmp.GetChar(nPos++) : '0';
255 nColor *= 16;
256 if( c >= '0' && c <= '9' )
257 nColor += (c - 48);
258 else if( c >= 'A' && c <= 'F' )
259 nColor += (c - 55);
263 rColor.SetRed( (sal_uInt8)((nColor & 0x00ff0000) >> 16) );
264 rColor.SetGreen( (sal_uInt8)((nColor & 0x0000ff00) >> 8));
265 rColor.SetBlue( (sal_uInt8)(nColor & 0x000000ff) );
268 HTMLInputType HTMLOption::GetInputType() const
270 DBG_ASSERT( nToken==HTML_O_TYPE, "GetInputType: Option not TYPE" );
271 return (HTMLInputType)GetEnum( aInputTypeOptEnums, HTML_IT_TEXT );
274 HTMLTableFrame HTMLOption::GetTableFrame() const
276 DBG_ASSERT( nToken==HTML_O_FRAME, "GetTableFrame: Option not FRAME" );
277 return (HTMLTableFrame)GetEnum( aTableFrameOptEnums, HTML_TF_VOID );
280 HTMLTableRules HTMLOption::GetTableRules() const
282 DBG_ASSERT( nToken==HTML_O_RULES, "GetTableRules: Option not RULES" );
283 return (HTMLTableRules)GetEnum( aTableRulesOptEnums, HTML_TR_NONE );
286 HTMLParser::HTMLParser( SvStream& rIn, bool bReadNewDoc ) :
287 SvParser( rIn ),
288 bNewDoc(bReadNewDoc),
289 bIsInHeader(true),
290 bIsInBody(false),
291 bReadListing(false),
292 bReadXMP(false),
293 bReadPRE(false),
294 bReadTextArea(false),
295 bReadScript(false),
296 bReadStyle(false),
297 bEndTokenFound(false),
298 bPre_IgnoreNewPara(false),
299 bReadNextChar(false),
300 bReadComment(false),
301 mnPendingOffToken(0)
303 //#i76649, default to UTF-8 for HTML unless we know differently
304 SetSrcEncoding(RTL_TEXTENCODING_UTF8);
307 HTMLParser::~HTMLParser()
311 SvParserState HTMLParser::CallParser()
313 eState = SVPAR_WORKING;
314 nNextCh = GetNextChar();
315 SaveState( 0 );
317 nPre_LinePos = 0;
318 bPre_IgnoreNewPara = false;
320 AddRef();
321 Continue( 0 );
322 if( SVPAR_PENDING != eState )
323 ReleaseRef(); // Parser not needed anymore
325 return eState;
328 void HTMLParser::Continue( int nToken )
330 if( !nToken )
331 nToken = GetNextToken();
333 while( IsParserWorking() )
335 SaveState( nToken );
336 nToken = FilterToken( nToken );
338 if( nToken )
339 NextToken( nToken );
341 if( IsParserWorking() )
342 SaveState( 0 ); // continue with new token
344 nToken = GetNextToken();
348 int HTMLParser::FilterToken( int nToken )
350 switch( nToken )
352 case sal_Unicode(EOF):
353 nToken = 0;
354 break; // don't pass
356 case HTML_HEAD_OFF:
357 bIsInBody = true;
358 case HTML_HEAD_ON:
359 bIsInHeader = HTML_HEAD_ON == nToken;
360 break;
362 case HTML_BODY_ON:
363 case HTML_FRAMESET_ON:
364 bIsInHeader = false;
365 bIsInBody = HTML_BODY_ON == nToken;
366 break;
368 case HTML_BODY_OFF:
369 bIsInBody = bReadPRE = bReadListing = bReadXMP = false;
370 break;
372 case HTML_HTML_OFF:
373 nToken = 0;
374 bReadPRE = bReadListing = bReadXMP = false;
375 break; // HTML_ON hasn't been passed either !
377 case HTML_PREFORMTXT_ON:
378 StartPRE();
379 break;
381 case HTML_PREFORMTXT_OFF:
382 FinishPRE();
383 break;
385 case HTML_LISTING_ON:
386 StartListing();
387 break;
389 case HTML_LISTING_OFF:
390 FinishListing();
391 break;
393 case HTML_XMP_ON:
394 StartXMP();
395 break;
397 case HTML_XMP_OFF:
398 FinishXMP();
399 break;
401 default:
402 if( bReadPRE )
403 nToken = FilterPRE( nToken );
404 else if( bReadListing )
405 nToken = FilterListing( nToken );
406 else if( bReadXMP )
407 nToken = FilterXMP( nToken );
409 break;
412 return nToken;
// Character classification helpers used by the scanner.
// The macros that expand their argument more than once evaluate it more than
// once, so pass only side-effect free expressions. The argument is
// parenthesised so that expressions with low-precedence operators are
// classified correctly.
#define HTML_ISDIGIT( c ) comphelper::string::isdigitAscii(c)
#define HTML_ISALPHA( c ) comphelper::string::isalphaAscii(c)
#define HTML_ISALNUM( c ) comphelper::string::isalnumAscii(c)
// Blank or one of the control characters 0x09..0x0d
#define HTML_ISSPACE( c ) ( ' ' == (c) || ((c) >= 0x09 && (c) <= 0x0d) )
// Everything from 32 upwards except 127
#define HTML_ISPRINTABLE( c ) ( (c) >= 32 && (c) != 127 )
#define HTML_ISHEXDIGIT( c ) comphelper::string::isxdigitAscii(c)
// Collects text into aToken, starting with the character already held in
// nNextCh.
//
// cBreak: character that ends the scan when it is met outside of quotes.
//         '>' selects the "inside a tag" behaviour: '\\', quotes and blanks
//         coming from entities are escaped with '\\', quotes are tracked
//         and whitespace is not folded.
// Returns HTML_TEXTTOKEN in the normal case, HTML_NONBREAKSPACE or
//         HTML_SOFTHYPH when such an entity is met outside of a tag, and 0
//         when only blanks were read before the end of the stream.
422 int HTMLParser::ScanText( const sal_Unicode cBreak )
424 OUStringBuffer sTmpBuffer( MAX_LEN );
425 int bContinue = true;
426 int bEqSignFound = false;
// Quote character that opened the option value being scanned, 0 if none.
427 sal_Unicode cQuote = 0U;
429 while( bContinue && IsParserWorking() )
// bNextCh: whether the next character has to be read at the end of this pass.
431 int bNextCh = true;
432 switch( nNextCh )
434 case '&':
435 bEqSignFound = false;
// Inside XMP the '&' is copied literally; otherwise it is decoded.
436 if( bReadXMP )
437 sTmpBuffer.append( (sal_Unicode)'&' );
438 else
// Remember where the '&' was, so the stream can be rewound to it.
440 sal_uLong nStreamPos = rInput.Tell();
441 sal_uLong nLinePos = GetLinePos();
443 sal_Unicode cChar = 0U;
// Numeric character reference: "&#" followed by decimal or hex digits.
444 if( '#' == (nNextCh = GetNextChar()) )
446 nNextCh = GetNextChar();
// NOTE(review): only a lowercase 'x' is accepted as hex prefix here; an
// uppercase 'X' is not treated as the start of a numeric reference.
447 const bool bIsHex( 'x' == nNextCh );
448 const bool bIsDecOrHex( bIsHex || HTML_ISDIGIT(nNextCh) );
449 if ( bIsDecOrHex )
451 if ( bIsHex )
453 nNextCh = GetNextChar();
454 while ( HTML_ISHEXDIGIT(nNextCh) )
456 cChar = cChar * 16U +
457 ( nNextCh <= '9'
458 ? sal_Unicode( nNextCh - '0' )
459 : ( nNextCh <= 'F'
460 ? sal_Unicode( nNextCh - 'A' + 10 )
461 : sal_Unicode( nNextCh - 'a' + 10 ) ) );
462 nNextCh = GetNextChar();
465 else
469 cChar = cChar * 10U + sal_Unicode( nNextCh - '0');
470 nNextCh = GetNextChar();
472 while( HTML_ISDIGIT(nNextCh) );
// Values below 256 are taken as a single byte in the source encoding and
// converted to Unicode, unless that encoding is unknown, UCS2 or UTF-8.
475 if( RTL_TEXTENCODING_DONTKNOW != eSrcEnc &&
476 RTL_TEXTENCODING_UCS2 != eSrcEnc &&
477 RTL_TEXTENCODING_UTF8 != eSrcEnc &&
478 cChar < 256 )
480 const sal_uInt32 convertFlags =
481 RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_DEFAULT |
482 RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_DEFAULT |
483 RTL_TEXTTOUNICODE_FLAGS_INVALID_DEFAULT;
485 sal_Char cEncodedChar = static_cast<sal_Char>(cChar);
486 cChar = OUString(&cEncodedChar, 1, eSrcEnc, convertFlags).toChar();
487 if( 0U == cChar )
489 // If the character could not be
490 // converted, because a conversion is not
491 // available, do no conversion at all.
492 cChar = cEncodedChar;
496 else
497 nNextCh = 0U;
// Named entity: collect at most MAX_ENTITY_LEN alphanumeric characters.
499 else if( HTML_ISALPHA( nNextCh ) )
501 OUStringBuffer sEntityBuffer( MAX_ENTITY_LEN );
502 xub_StrLen nPos = 0L;
505 sEntityBuffer.append( nNextCh );
506 nPos++;
507 nNextCh = GetNextChar();
509 while( nPos < MAX_ENTITY_LEN && HTML_ISALNUM( nNextCh ) &&
510 !rInput.IsEof() );
512 if( IsParserWorking() && !rInput.IsEof() )
514 OUString sEntity(sEntityBuffer.getStr(), nPos);
515 cChar = GetHTMLCharName( sEntity );
517 // not found ( == 0 ): plain text
518 // or a character which is inserted as attribute
519 if( 0U == cChar && ';' != nNextCh )
521 DBG_ASSERT( rInput.Tell() - nStreamPos ==
522 (sal_uLong)(nPos+1L)*GetCharSize(),
523 "UTF-8 is failing here" );
// Retry with successively shorter prefixes of the collected name; on a
// match the stream is moved back behind the matching prefix.
524 for( xub_StrLen i=nPos-1L; i>1L; i-- )
526 nNextCh = sEntityBuffer[i];
527 sEntityBuffer.setLength( i );
528 sEntity = OUString(sEntityBuffer.getStr(), i);
529 cChar = GetHTMLCharName( sEntity );
530 if( cChar )
532 rInput.SeekRel( -(long)
533 ((nPos-i)*GetCharSize()) );
534 nlLinePos -= sal_uInt32(nPos-i);
535 nPos = i;
536 ClearTxtConvContext();
537 break;
542 if( !cChar ) // unknown character?
544 // back in stream, insert '&'
545 // and restart with next character
546 sTmpBuffer.append( (sal_Unicode)'&' );
548 DBG_ASSERT( rInput.Tell()-nStreamPos ==
549 (sal_uLong)(nPos+1)*GetCharSize(),
550 "Wrong stream position" );
551 DBG_ASSERT( nlLinePos-nLinePos ==
552 (sal_uLong)(nPos+1),
553 "Wrong line position" );
554 rInput.Seek( nStreamPos );
555 nlLinePos = nLinePos;
556 ClearTxtConvContext();
557 break;
560 // 1 == Non Breaking Space
561 // 2 == SoftHyphen
563 if( cChar < 3U )
565 if( '>' == cBreak )
567 // When reading the content of a tag we have
568 // to change it to ' ' or '-'
569 switch( cChar )
571 case 1U: cChar = ' '; break;
572 case 2U: cChar = '-'; break;
573 default:
// NOTE(review): the message literal below starts with "\0", i.e. an embedded
// NUL character, so the text after it is presumably never shown - confirm.
574 DBG_ASSERT( cChar==1U,
575 "\0x00 should be handled already!" );
576 break;
579 else
581 // If not scanning a tag return token
582 aToken +=
583 String( sTmpBuffer.makeStringAndClear() );
584 if( cChar )
586 if( aToken.Len() )
588 // restart with character
589 nNextCh = '&';
590 DBG_ASSERT( rInput.Tell()-nStreamPos ==
591 (sal_uLong)(nPos+1)*GetCharSize(),
592 "Wrong stream position" );
593 DBG_ASSERT( nlLinePos-nLinePos ==
594 (sal_uLong)(nPos+1),
595 "Wrong line position" );
596 rInput.Seek( nStreamPos );
597 nlLinePos = nLinePos;
598 ClearTxtConvContext();
599 return HTML_TEXTTOKEN;
602 // Hack: _GetNextChar shall not read the
603 // next character
604 if( ';' != nNextCh )
605 aToken += ' ';
606 if( 1U == cChar )
607 return HTML_NONBREAKSPACE;
608 if( 2U == cChar )
609 return HTML_SOFTHYPH;
611 aToken += (sal_Unicode)'&';
612 aToken +=
613 String(sEntityBuffer.makeStringAndClear());
614 break;
618 else
619 nNextCh = 0U;
621 // &{...};-JavaScript-Macros are not supported any longer.
622 else if( IsParserWorking() )
624 sTmpBuffer.append( (sal_Unicode)'&' );
625 bNextCh = false;
626 break;
// A ';' terminating the reference is consumed; any other character is kept
// as the next character to process.
629 bNextCh = (';' == nNextCh);
630 if( cBreak=='>' && (cChar=='\\' || cChar=='\'' ||
631 cChar=='\"' || cChar==' ') )
633 // ' and " have to be escaped withing tags to separate
634 // them from ' and " enclosing options.
635 // \ has to be escaped as well.
636 // Space is protected because it's not a delimiter between
637 // options.
638 sTmpBuffer.append( (sal_Unicode)'\\' );
639 if( MAX_LEN == sTmpBuffer.getLength() )
640 aToken += String(sTmpBuffer.makeStringAndClear());
642 if( IsParserWorking() )
644 if( cChar )
645 sTmpBuffer.append( cChar );
647 else if( SVPAR_PENDING==eState && '>'!=cBreak )
649 // Restart with '&', the remainder is returned as
650 // text token.
651 if( aToken.Len() || sTmpBuffer.getLength() )
653 // _GetNextChar() returns the previous text and
654 // during the next execution a new character is read.
655 // Thus we have to position in front of the '&'.
656 nNextCh = 0U;
657 rInput.Seek( nStreamPos-(sal_uInt32)GetCharSize() );
658 nlLinePos = nLinePos-1;
659 ClearTxtConvContext();
660 bReadNextChar = true;
662 bNextCh = false;
665 break;
666 case '=':
// Inside a tag an unquoted '=' announces an option value; a quote directly
// following it opens a quoted value.
667 if( '>'==cBreak && !cQuote )
668 bEqSignFound = true;
669 sTmpBuffer.append( nNextCh );
670 break;
672 case '\\':
673 if( '>'==cBreak )
675 // Inside of tags: mark it (escape it with an additional backslash)
676 sTmpBuffer.append( (sal_Unicode)'\\' );
677 if( MAX_LEN == sTmpBuffer.getLength() )
678 aToken += String(sTmpBuffer.makeStringAndClear());
680 sTmpBuffer.append( (sal_Unicode)'\\' );
681 break;
683 case '\"':
684 case '\'':
685 if( '>'==cBreak )
687 if( bEqSignFound )
688 cQuote = nNextCh;
689 else if( cQuote && (cQuote==nNextCh ) )
690 cQuote = 0U;
692 sTmpBuffer.append( nNextCh );
693 bEqSignFound = false;
694 break;
696 case sal_Unicode(EOF):
// Only a real end of stream stops the scan; otherwise the character is
// ordinary text.
697 if( rInput.IsEof() )
699 bContinue = false;
701 else
703 sTmpBuffer.append( nNextCh );
705 break;
707 case '<':
708 bEqSignFound = false;
709 if( '>'==cBreak )
710 sTmpBuffer.append( nNextCh );
711 else
712 bContinue = false; // break, assemble the string
713 break;
715 case '\f':
716 if( '>' == cBreak )
718 // If scanning options treat it like a space, ...
719 sTmpBuffer.append( (sal_Unicode)' ' );
721 else
723 // otherwise it's a separate token.
724 bContinue = false;
726 break;
728 case '\r':
729 case '\n':
730 if( '>'==cBreak )
732 // cr/lf in tag is handled in _GetNextToken()
733 sTmpBuffer.append( nNextCh );
734 break;
736 else if( bReadListing || bReadXMP || bReadPRE || bReadTextArea )
738 bContinue = false;
739 break;
741 // Reduce sequence of CR/LF/BLANK/TAB to a single blank
742 // no break!!
743 case '\t':
744 if( '\t'==nNextCh && bReadPRE && '>'!=cBreak )
746 // In <PRE>: pass tabs up to the caller
747 bContinue = false;
748 break;
750 // no break
751 case '\x0b':
752 if( '\x0b'==nNextCh && (bReadPRE || bReadXMP ||bReadListing) &&
753 '>'!=cBreak )
755 break;
757 nNextCh = ' ';
758 // no break;
759 case ' ':
760 sTmpBuffer.append( nNextCh );
761 if( '>'!=cBreak && (!bReadListing && !bReadXMP &&
762 !bReadPRE && !bReadTextArea) )
764 // Reduce sequences of Blanks/Tabs/CR/LF to a single blank
765 do {
766 if( sal_Unicode(EOF) == (nNextCh = GetNextChar()) &&
767 rInput.IsEof() )
769 if( aToken.Len() || sTmpBuffer.getLength() > 1L )
771 // Have seen s.th. aside from blanks?
772 aToken += String(sTmpBuffer.makeStringAndClear());
773 return HTML_TEXTTOKEN;
775 else
776 // Only read blanks: no text must be returned
777 // and _GetNextToken has to read until EOF
778 return 0;
780 } while ( ' ' == nNextCh || '\t' == nNextCh ||
781 '\r' == nNextCh || '\n' == nNextCh ||
782 '\x0b' == nNextCh );
783 bNextCh = false;
785 break;
787 default:
788 bEqSignFound = false;
// Stop at an unquoted break character, or when appending another buffer
// could exceed the maximum length of aToken.
789 if( (nNextCh==cBreak && !cQuote) ||
790 (sal_uLong(aToken.Len()) + MAX_LEN) > sal_uLong(STRING_MAXLEN & ~1 ))
791 bContinue = false;
792 else
794 do {
795 // All remaining characters make their way into the text.
796 sTmpBuffer.append( nNextCh );
797 if( MAX_LEN == sTmpBuffer.getLength() )
799 aToken += String(sTmpBuffer.makeStringAndClear());
800 if( (sal_uLong(aToken.Len()) + MAX_LEN) >
801 sal_uLong(STRING_MAXLEN & ~1 ) )
803 nNextCh = GetNextChar();
804 return HTML_TEXTTOKEN;
807 if( ( sal_Unicode(EOF) == (nNextCh = GetNextChar()) &&
808 rInput.IsEof() ) ||
809 !IsParserWorking() )
811 if( sTmpBuffer.getLength() )
812 aToken += String(sTmpBuffer.makeStringAndClear());
813 return HTML_TEXTTOKEN;
815 } while( HTML_ISALPHA( nNextCh ) || HTML_ISDIGIT( nNextCh ) );
816 bNextCh = false;
// Flush the buffer into aToken once it is full.
820 if( MAX_LEN == sTmpBuffer.getLength() )
821 aToken += String(sTmpBuffer.makeStringAndClear());
823 if( bContinue && bNextCh )
824 nNextCh = GetNextChar();
827 if( sTmpBuffer.getLength() )
828 aToken += String(sTmpBuffer.makeStringAndClear());
830 return HTML_TEXTTOKEN;
// Reads raw data while the parser is inside a script or style sheet, or
// while aEndToken is set (see the caller's condition in _GetNextToken).
// The text read is collected in aToken; a line end, the end of the stream
// or the matching end token finishes one call.
// Returns HTML_RAWDATA by default, or 0 if the end token had already been
// found by the previous call, if the end was reached without any text, or
// if the parser is no longer working.
833 int HTMLParser::_GetNextRawToken()
835 OUStringBuffer sTmpBuffer( MAX_LEN );
837 if( bEndTokenFound )
839 // During the last execution we already found the end token,
840 // thus we don't have to search it again.
841 bReadScript = false;
842 bReadStyle = false;
843 aEndToken.Erase();
844 bEndTokenFound = false;
846 return 0;
849 // Default return value: HTML_RAWDATA
850 int bContinue = true;
851 int nToken = HTML_RAWDATA;
852 SaveState( 0 );
853 while( bContinue && IsParserWorking() )
855 int bNextCh = true;
856 switch( nNextCh )
858 case '<':
860 // Maybe we've reached the end.
862 // Save what we have read previously...
863 aToken += String(sTmpBuffer.makeStringAndClear());
865 // and remember position in stream.
866 sal_uLong nStreamPos = rInput.Tell();
867 sal_uLong nLineNr = GetLineNr();
868 sal_uLong nLinePos = GetLinePos();
870 // Start of an end token?
871 int bOffState = false;
872 if( '/' == (nNextCh = GetNextChar()) )
874 bOffState = true;
875 nNextCh = GetNextChar();
877 else if( '!' == nNextCh )
879 sTmpBuffer.append( nNextCh );
880 nNextCh = GetNextChar();
883 // Read following letters
884 while( (HTML_ISALPHA(nNextCh) || '-'==nNextCh) &&
885 IsParserWorking() && sTmpBuffer.getLength() < MAX_LEN )
887 sTmpBuffer.append( nNextCh );
888 nNextCh = GetNextChar();
// Compare the upper-cased name against the possible end tokens.
891 String aTok( sTmpBuffer.toString() );
892 aTok.ToUpperAscii();
893 bool bDone = false;
894 if( bReadScript || aEndToken.Len() )
896 if( !bReadComment )
// The first three characters are compared with the comment token to detect
// the start of a comment inside the raw data.
898 if( aTok.CompareToAscii( OOO_STRING_SVTOOLS_HTML_comment, 3 )
899 == COMPARE_EQUAL )
901 bReadComment = true;
903 else
905 // A script has to end with "</SCRIPT>". But
906 // ">" is optional for security reasons
907 bDone = bOffState &&
908 COMPARE_EQUAL == ( bReadScript
909 ? aTok.CompareToAscii(OOO_STRING_SVTOOLS_HTML_script)
910 : aTok.CompareTo(aEndToken) );
913 if( bReadComment && '>'==nNextCh && aTok.Len() >= 2 &&
914 aTok.Copy( aTok.Len()-2 ).EqualsAscii( "--" ) )
916 // End of comment of style <!----->
917 bReadComment = false;
920 else
922 // Style sheets can be closed by </STYLE>, </HEAD> or <BODY>
923 if( bOffState )
924 bDone = aTok.CompareToAscii(OOO_STRING_SVTOOLS_HTML_style)
925 == COMPARE_EQUAL ||
926 aTok.CompareToAscii(OOO_STRING_SVTOOLS_HTML_head)
927 == COMPARE_EQUAL;
928 else
929 bDone =
930 aTok.CompareToAscii(OOO_STRING_SVTOOLS_HTML_body) == COMPARE_EQUAL;
933 if( bDone )
935 // Done! Return the previously read string (if requested)
936 // and continue.
938 bContinue = false;
940 // nToken==0 means, _GetNextToken continues to read
941 if( !aToken.Len() && (bReadStyle || bReadScript) )
943 // Immediately close environment (or context?)
944 // and parse the end token
945 bReadScript = false;
946 bReadStyle = false;
947 aEndToken.Erase();
948 nToken = 0;
950 else
952 // Keep bReadScript/bReadStyle alive
953 // and parse end token during next execution
954 bEndTokenFound = true;
957 // Move backwards in stream to '<'
958 rInput.Seek( nStreamPos );
959 SetLineNr( nLineNr );
960 SetLinePos( nLinePos );
961 ClearTxtConvContext();
962 nNextCh = '<';
964 // Don't append string to token.
965 sTmpBuffer.setLength( 0L );
967 else
969 // remember "</" , everything else we find in the buffer
970 aToken += (sal_Unicode)'<';
971 if( bOffState )
972 aToken += (sal_Unicode)'/';
974 bNextCh = false;
977 break;
978 case '-':
979 sTmpBuffer.append( nNextCh );
// Inside a comment a run of at least two '-' followed by '>' ends the comment.
980 if( bReadComment )
982 bool bTwoMinus = false;
983 nNextCh = GetNextChar();
984 while( '-' == nNextCh && IsParserWorking() )
986 bTwoMinus = true;
988 if( MAX_LEN == sTmpBuffer.getLength() )
989 aToken += String(sTmpBuffer.makeStringAndClear());
990 sTmpBuffer.append( nNextCh );
991 nNextCh = GetNextChar();
994 if( '>' == nNextCh && IsParserWorking() && bTwoMinus )
995 bReadComment = false;
997 bNextCh = false;
999 break;
1001 case '\r':
1002 // \r\n? closes the current text token (even if it's empty)
1003 nNextCh = GetNextChar();
1004 if( nNextCh=='\n' )
1005 nNextCh = GetNextChar();
1006 bContinue = false;
1007 break;
1008 case '\n':
1009 // \n closes the current text token (even if it's empty)
1010 nNextCh = GetNextChar();
1011 bContinue = false;
1012 break;
1013 case sal_Unicode(EOF):
1014 // eof closes the current text token and behaves like having read
1015 // an end token
1016 if( rInput.IsEof() )
1018 bContinue = false;
1019 if( aToken.Len() || sTmpBuffer.getLength() )
1021 bEndTokenFound = true;
1023 else
1025 bReadScript = false;
1026 bReadStyle = false;
1027 aEndToken.Erase();
1028 nToken = 0;
1030 break;
1032 // no break
1033 default:
1034 // all remaining characters are appended to the buffer
1035 sTmpBuffer.append( nNextCh );
1036 break;
// Flush the buffer into aToken when the scan ends or the buffer is full.
1039 if( (!bContinue && sTmpBuffer.getLength() > 0L) ||
1040 MAX_LEN == sTmpBuffer.getLength() )
1041 aToken += String(sTmpBuffer.makeStringAndClear());
1043 if( bContinue && bNextCh )
1044 nNextCh = GetNextChar();
1047 if( IsParserWorking() )
1048 SaveState( 0 );
1049 else
1050 nToken = 0;
1052 return nToken;
1055 // Scan next token
// Returns the id of the next token and leaves its text in aToken.
// In order, the function
//  - returns a pending off token generated for a "<TOKEN ... />" tag,
//  - delegates to _GetNextRawToken() while reading script/style/raw data,
//  - otherwise dispatches on nNextCh: '<' starts a tag, comment or "<%"
//    block, line ends/tabs/form feeds may become separate tokens, and
//    everything else is scanned as text by ScanText().
// If the parser state is SVPAR_PENDING at the end, -1 is returned.
1056 int HTMLParser::_GetNextToken()
1058 int nRet = 0;
1059 sSaveToken.Erase();
1061 if (mnPendingOffToken)
1063 // HTML_<TOKEN>_OFF generated for HTML_<TOKEN>_ON
1064 nRet = mnPendingOffToken;
1065 mnPendingOffToken = 0;
1066 aToken.Erase();
1067 return nRet;
1070 // Delete options
1071 if (!maOptions.empty())
1072 maOptions.clear();
1074 if( !IsParserWorking() ) // Don't continue if already an error occurred
1075 return 0;
// Remember the flag so it can be restored if the parser becomes pending.
1077 bool bReadNextCharSave = bReadNextChar;
1078 if( bReadNextChar )
1080 DBG_ASSERT( !bEndTokenFound,
1081 "Read a character despite </SCRIPT> was read?" );
1082 nNextCh = GetNextChar();
1083 if( !IsParserWorking() ) // Don't continue if already an error occurred
1084 return 0;
1085 bReadNextChar = false;
1088 if( bReadScript || bReadStyle || aEndToken.Len() )
1090 nRet = _GetNextRawToken();
1091 if( nRet || !IsParserWorking() )
1092 return nRet;
1095 do {
1096 int bNextCh = true;
1097 switch( nNextCh )
1099 case '<':
// Remember the position of the '<' to be able to return it as plain text.
1101 sal_uLong nStreamPos = rInput.Tell();
1102 sal_uLong nLineNr = GetLineNr();
1103 sal_uLong nLinePos = GetLinePos();
1105 int bOffState = false;
1106 if( '/' == (nNextCh = GetNextChar()) )
1108 bOffState = true;
1109 nNextCh = GetNextChar();
1111 if( HTML_ISALPHA( nNextCh ) || '!'==nNextCh )
// Collect the tag name up to '>', '/' or whitespace.
1113 OUStringBuffer sTmpBuffer;
1114 do {
1115 sTmpBuffer.append( nNextCh );
1116 if( MAX_LEN == sTmpBuffer.getLength() )
1117 aToken += String(sTmpBuffer.makeStringAndClear());
1118 nNextCh = GetNextChar();
1119 } while( '>' != nNextCh && '/' != nNextCh && !HTML_ISSPACE( nNextCh ) &&
1120 IsParserWorking() && !rInput.IsEof() );
1122 if( sTmpBuffer.getLength() )
1123 aToken += String(sTmpBuffer.makeStringAndClear());
1125 // Skip blanks
1126 while( HTML_ISSPACE( nNextCh ) && IsParserWorking() )
1127 nNextCh = GetNextChar();
1129 if( !IsParserWorking() )
1131 if( SVPAR_PENDING == eState )
1132 bReadNextChar = bReadNextCharSave;
1133 break;
1136 // Search token in table:
1137 sSaveToken = aToken;
1138 aToken.ToUpperAscii();
1139 if( 0 == (nRet = GetHTMLToken( aToken )) )
1140 // Unknown control
1141 nRet = HTML_UNKNOWNCONTROL_ON;
1143 // If it's a token which can be switched off...
1144 if( bOffState )
1146 if( HTML_TOKEN_ONOFF & nRet )
1148 // and there is an off token, return off token instead
1149 ++nRet;
1151 else if( HTML_LINEBREAK!=nRet )
1153 // and there is no off token, return unknown token.
1154 // (except for </BR>, that is treated like <BR>)
1155 nRet = HTML_UNKNOWNCONTROL_OFF;
1159 if( nRet == HTML_COMMENT )
1161 // fix: due to being case sensitive use sSaveToken as start of comment
1162 // and append a blank.
1163 aToken = sSaveToken;
1164 if( '>'!=nNextCh )
1165 aToken += (sal_Unicode)' ';
// Position and token length at the first '>' met, used to restart there if
// no closing "-->" is found.
1166 sal_uLong nCStreamPos = 0;
1167 sal_uLong nCLineNr = 0;
1168 sal_uLong nCLinePos = 0;
1169 xub_StrLen nCStrLen = 0;
1171 bool bDone = false;
1172 // Read until closing -->. If not found restart at first >
1173 while( !bDone && !rInput.IsEof() && IsParserWorking() )
1175 if( '>'==nNextCh )
1177 if( !nCStreamPos )
1179 nCStreamPos = rInput.Tell();
1180 nCStrLen = aToken.Len();
1181 nCLineNr = GetLineNr();
1182 nCLinePos = GetLinePos();
1184 bDone = aToken.Len() >= 2 &&
1185 aToken.Copy(aToken.Len()-2,2).
1186 EqualsAscii( "--" );
1187 if( !bDone )
1188 aToken += nNextCh;
1190 else
1191 aToken += nNextCh;
1192 if( !bDone )
1193 nNextCh = GetNextChar();
1195 if( !bDone && IsParserWorking() && nCStreamPos )
1197 rInput.Seek( nCStreamPos );
1198 SetLineNr( nCLineNr );
1199 SetLinePos( nCLinePos );
1200 ClearTxtConvContext();
1201 aToken.Erase( nCStrLen );
1202 nNextCh = '>';
1205 else
1207 // TokenString not needed anymore
1208 aToken.Erase();
1211 // Read until closing '>'
1212 if( '>' != nNextCh && IsParserWorking() )
1214 ScanText( '>' );
1216 // fdo#34666 fdo#36080 fdo#36390: closing "/>"?:
1217 // generate pending HTML_<TOKEN>_OFF for HTML_<TOKEN>_ON
1218 // Do not convert this to a single HTML_<TOKEN>_OFF
1219 // which lead to fdo#56772.
1220 if ((HTML_TOKEN_ONOFF & nRet) && (aToken.Len() >= 1) &&
1221 ('/' == aToken.GetChar(aToken.Len()-1)))
1223 mnPendingOffToken = nRet + 1; // HTML_<TOKEN>_ON -> HTML_<TOKEN>_OFF
1224 aToken.Erase( aToken.Len()-1, 1); // remove trailing '/'
1226 if( sal_Unicode(EOF) == nNextCh && rInput.IsEof() )
1228 // Move back in front of < and restart there.
1229 // Return < as text.
1230 rInput.Seek( nStreamPos );
1231 SetLineNr( nLineNr );
1232 SetLinePos( nLinePos );
1233 ClearTxtConvContext();
1235 aToken = '<';
1236 nRet = HTML_TEXTTOKEN;
1237 nNextCh = GetNextChar();
1238 bNextCh = false;
1239 break;
1242 if( SVPAR_PENDING == eState )
1243 bReadNextChar = bReadNextCharSave;
1245 else
1247 if( bOffState )
1249 // simply throw everything away
1250 ScanText( '>' );
1251 if( sal_Unicode(EOF) == nNextCh && rInput.IsEof() )
1253 // Move back in front of < and restart there.
1254 // Return < as text.
1255 rInput.Seek( nStreamPos );
1256 SetLineNr( nLineNr );
1257 SetLinePos( nLinePos );
1258 ClearTxtConvContext();
1260 aToken = '<';
1261 nRet = HTML_TEXTTOKEN;
1262 nNextCh = GetNextChar();
1263 bNextCh = false;
1264 break;
1266 if( SVPAR_PENDING == eState )
1267 bReadNextChar = bReadNextCharSave;
1268 aToken.Erase();
// "<%": the content up to the closing "%>" is returned as an unknown control.
1270 else if( '%' == nNextCh )
1272 nRet = HTML_UNKNOWNCONTROL_ON;
1274 sal_uLong nCStreamPos = rInput.Tell();
1275 sal_uLong nCLineNr = GetLineNr(), nCLinePos = GetLinePos();
1277 bool bDone = false;
1278 // Read until closing %>. If not found restart at first >.
1279 while( !bDone && !rInput.IsEof() && IsParserWorking() )
1281 bDone = '>'==nNextCh && aToken.Len() >= 1 &&
1282 '%' == aToken.GetChar( aToken.Len()-1 );
1283 if( !bDone )
1285 aToken += nNextCh;
1286 nNextCh = GetNextChar();
1289 if( !bDone && IsParserWorking() )
1291 rInput.Seek( nCStreamPos );
1292 SetLineNr( nCLineNr );
1293 SetLinePos( nCLinePos );
1294 ClearTxtConvContext();
1295 aToken.AssignAscii( "<%", 2 );
1296 nRet = HTML_TEXTTOKEN;
1297 break;
1299 if( IsParserWorking() )
1301 sSaveToken = aToken;
1302 aToken.Erase();
1305 else
// A '<' that starts neither a tag nor a "<%" block is plain text.
1307 aToken = '<';
1308 nRet = HTML_TEXTTOKEN;
1309 bNextCh = false;
1310 break;
1314 if( IsParserWorking() )
// The closing '>' is consumed; afterwards the read modes are updated for
// the token that was just recognised.
1316 bNextCh = '>' == nNextCh;
1317 switch( nRet )
1319 case HTML_TEXTAREA_ON:
1320 bReadTextArea = true;
1321 break;
1322 case HTML_TEXTAREA_OFF:
1323 bReadTextArea = false;
1324 break;
1325 case HTML_SCRIPT_ON:
1326 if( !bReadTextArea )
1327 bReadScript = true;
1328 break;
1329 case HTML_SCRIPT_OFF:
1330 if( !bReadTextArea )
1332 bReadScript = false;
1333 // JavaScript might modify the stream,
1334 // thus the last character has to be read again.
1335 bReadNextChar = true;
1336 bNextCh = false;
1338 break;
1340 case HTML_STYLE_ON:
1341 bReadStyle = true;
1342 break;
1343 case HTML_STYLE_OFF:
1344 bReadStyle = false;
1345 break;
1349 break;
1351 case sal_Unicode(EOF):
1352 if( rInput.IsEof() )
1354 eState = SVPAR_ACCEPTED;
1355 nRet = nNextCh;
1357 else
1359 // Read normal text.
1360 goto scan_text;
1362 break;
1364 case '\f':
1365 // form feeds are passed upwards separately
1366 nRet = HTML_LINEFEEDCHAR; // !!! should be FORMFEEDCHAR
1367 break;
1369 case '\n':
1370 case '\r':
1371 if( bReadListing || bReadXMP || bReadPRE || bReadTextArea )
// A "\r\n" or "\n\r" pair counts as one line end; any other following
// character is kept as the next character to process.
1373 sal_Unicode c = GetNextChar();
1374 if( ( '\n' != nNextCh || '\r' != c ) &&
1375 ( '\r' != nNextCh || '\n' != c ) )
1377 bNextCh = false;
1378 nNextCh = c;
1380 nRet = HTML_NEWPARA;
1381 break;
1383 // no break !
1384 case '\t':
1385 if( bReadPRE )
1387 nRet = HTML_TABCHAR;
1388 break;
1390 // no break !
1391 case ' ':
1392 // no break !
1393 default:
1395 scan_text:
1396 // "normal" text to come
1397 nRet = ScanText();
1398 bNextCh = 0 == aToken.Len();
1400 // the text should be processed
1401 if( !bNextCh && eState == SVPAR_PENDING )
1403 eState = SVPAR_WORKING;
1404 bReadNextChar = true;
1407 break;
1410 if( bNextCh && SVPAR_WORKING == eState )
1412 nNextCh = GetNextChar();
// If reading ahead made the parser pending although a non-text token is
// already complete, deliver the token now and re-read the character later.
1413 if( SVPAR_PENDING == eState && nRet && HTML_TEXTTOKEN != nRet )
1415 bReadNextChar = true;
1416 eState = SVPAR_WORKING;
1420 } while( !nRet && SVPAR_WORKING == eState );
1422 if( SVPAR_PENDING == eState )
1423 nRet = -1; // s.th. invalid
1425 return nRet;
1428 void HTMLParser::UnescapeToken()
1430 xub_StrLen nPos=0;
1432 bool bEscape = false;
1433 while( nPos < aToken.Len() )
1435 bool bOldEscape = bEscape;
1436 bEscape = false;
1437 if( '\\'==aToken.GetChar(nPos) && !bOldEscape )
1439 aToken.Erase( nPos, 1 );
1440 bEscape = true;
1442 else
1444 nPos++;
// Parses the option (attribute) list contained in aToken into maOptions and
// returns a reference to maOptions.
//
// pNoConvertToken: may be null. When non-null, CR/LF characters inside the
//     quoted value of the option whose token equals *pNoConvertToken are kept
//     instead of being stripped (see bStripCRLF below).
//
// The result is cached: once maOptions is non-empty it is returned as is.
// Although the method is const it edits aToken in place (constness is cast
// away) while removing escape backslashes and stripped line ends.
//
// NOTE(review): brace-only lines are not present in this listing; the nesting
// described in the comments below is inferred from the statement order -
// confirm against the upstream file.
1449 const HTMLOptions& HTMLParser::GetOptions( sal_uInt16 *pNoConvertToken ) const
1451 // If the options for the current token have already been returned,
1452 // return them once again.
1453 if (!maOptions.empty())
1454 return maOptions;
1456 xub_StrLen nPos = 0;
1457 while( nPos < aToken.Len() )
1459 // A letter? Option beginning here.
1460 if( HTML_ISALPHA( aToken.GetChar(nPos) ) )
1462 int nToken;
1463 String aValue;
1464 xub_StrLen nStt = nPos;
1465 sal_Unicode cChar = 0;
1467 // Actually only certain characters allowed.
1468 // Netscape only looks for "=" and white space (c.f.
1469 // Mozilla: PA_FetchRequestedNameValues in lipparse/pa_mdl.c)
// The name ends at '=', at white space, at a non-printable character or at
// the end of the token.
1470 while( nPos < aToken.Len() && '=' != (cChar=aToken.GetChar(nPos)) &&
1471 HTML_ISPRINTABLE(cChar) && !HTML_ISSPACE(cChar) )
1472 nPos++;
1474 String sName( aToken.Copy( nStt, nPos-nStt ) );
1476 // PlugIns require original token name. Convert to upper case only for searching.
1477 String sNameUpperCase( sName );
1478 sNameUpperCase.ToUpperAscii();
1480 nToken = GetHTMLOption( sNameUpperCase ); // Name is ready
1481 DBG_ASSERTWARNING( nToken!=HTML_O_UNKNOWN,
1482 "GetOption: unknown HTML option" );
// CR/LF are stripped from quoted values unless the option token lies in
// [HTML_OPTION_SCRIPT_START, HTML_OPTION_SCRIPT_END) or equals the token the
// caller passed through pNoConvertToken.
1483 bool bStripCRLF = (nToken < HTML_OPTION_SCRIPT_START ||
1484 nToken >= HTML_OPTION_SCRIPT_END) &&
1485 (!pNoConvertToken || nToken != *pNoConvertToken);
// Skip white space and non-printable characters that follow the name.
1487 while( nPos < aToken.Len() &&
1488 ( !HTML_ISPRINTABLE( (cChar=aToken.GetChar(nPos)) ) ||
1489 HTML_ISSPACE(cChar) ) )
1490 nPos++;
1492 // Option with value?
1493 if( nPos!=aToken.Len() && '='==cChar )
1495 nPos++;
// Skip blanks, tabs, line ends and non-printable characters after the '='.
1497 while( nPos < aToken.Len() &&
1498 ( !HTML_ISPRINTABLE( (cChar=aToken.GetChar(nPos)) ) ||
1499 ' '==cChar || '\t'==cChar || '\r'==cChar || '\n'==cChar ) )
1500 nPos++;
1502 if( nPos != aToken.Len() )
1504 xub_StrLen nLen = 0;
1505 nStt = nPos;
// Quoted value: it runs up to the next unescaped quote of the same kind as
// the opening one. nStt is moved behind the opening quote; nLen counts the
// characters that remain part of the value.
1506 if( ('"'==cChar) || ('\'')==cChar )
1508 sal_Unicode cEnd = cChar;
1509 nPos++; nStt++;
1510 bool bDone = false;
1511 bool bEscape = false;
1512 while( nPos < aToken.Len() && !bDone )
1514 bool bOldEscape = bEscape;
1515 bEscape = false;
1516 cChar = aToken.GetChar(nPos);
1517 switch( cChar )
1519 case '\r':
1520 case '\n':
// Line ends are either erased from aToken (position stays put) or kept.
1521 if( bStripCRLF )
1522 ((String &)aToken).Erase( nPos, 1 );
1523 else
1524 nPos++, nLen++;
1525 break;
1526 case '\\':
// An escaped backslash is kept; an unescaped one is erased and escapes the
// character that follows it.
1527 if( bOldEscape )
1529 nPos++, nLen++;
1531 else
1533 ((String &)aToken).Erase( nPos, 1 );
1534 bEscape = true;
1536 break;
1537 case '"':
1538 case '\'':
// Only an unescaped quote equal to the opening quote terminates the value.
1539 bDone = !bOldEscape && cChar==cEnd;
1540 if( !bDone )
1541 nPos++, nLen++;
1542 break;
1543 default:
1544 nPos++, nLen++;
1545 break;
// Step over the closing quote unless the token ended before one was found.
1548 if( nPos!=aToken.Len() )
1549 nPos++;
1551 else
1553 // More liberal than the standard: allow all printable characters
1554 bool bEscape = false;
1555 bool bDone = false;
1556 while( nPos < aToken.Len() && !bDone )
1558 bool bOldEscape = bEscape;
1559 bEscape = false;
1560 sal_Unicode c = aToken.GetChar(nPos);
1561 switch( c )
1563 case ' ':
// An unescaped blank ends the value; an escaped blank belongs to it.
1564 bDone = !bOldEscape;
1565 if( !bDone )
1566 nPos++, nLen++;
1567 break;
1569 case '\t':
1570 case '\r':
1571 case '\n':
// Tabs and line ends always end an unquoted value.
1572 bDone = true;
1573 break;
1575 case '\\':
// Same backslash handling as in the quoted case.
1576 if( bOldEscape )
1578 nPos++, nLen++;
1580 else
1582 ((String &)aToken).Erase( nPos, 1 );
1583 bEscape = true;
1585 break;
1587 default:
// Any other printable character is part of the value; a non-printable one
// ends it.
1588 if( HTML_ISPRINTABLE( c ) )
1589 nPos++, nLen++;
1590 else
1591 bDone = true;
1592 break;
// Copy the collected value; an empty value leaves aValue empty.
1597 if( nLen )
1598 aValue = aToken.Copy( nStt, nLen );
1602 // Token is known and can be saved
// The option stores the name in its original spelling (sName), not the
// upper-cased copy used for the lookup.
// NOTE(review): presumably maOptions takes ownership of the object handed
// over via std::auto_ptr - confirm against the HTMLOptions typedef.
1603 std::auto_ptr<HTMLOption> pOption(
1604 new HTMLOption(sal::static_int_cast<sal_uInt16>(nToken), sName, aValue));
1606 maOptions.push_back(pOption);
1608 else
1609 // Ignore white space and unexpected characters
1610 nPos++;
1613 return maOptions;
1616 int HTMLParser::FilterPRE( int nToken )
1618 switch( nToken )
1620 #ifdef HTML_BEHAVIOUR
1621 // These become LFs according to the definition
1622 case HTML_PARABREAK_ON:
1623 case HTML_LINEBREAK:
1624 nToken = HTML_NEWPARA;
1625 #else
1626 // in Netscape they only have impact in not empty paragraphs
1627 case HTML_PARABREAK_ON:
1628 nToken = HTML_LINEBREAK;
1629 case HTML_LINEBREAK:
1630 #endif
1631 case HTML_NEWPARA:
1632 nPre_LinePos = 0;
1633 if( bPre_IgnoreNewPara )
1634 nToken = 0;
1635 break;
1637 case HTML_TABCHAR:
1639 sal_Int32 nSpaces = (8 - (nPre_LinePos % 8));
1640 DBG_ASSERT( !aToken.Len(), "Why is the token not empty?" );
1641 if (aToken.Len() < nSpaces)
1643 using comphelper::string::padToLength;
1644 OUStringBuffer aBuf(aToken);
1645 aToken = padToLength(aBuf, nSpaces, ' ').makeStringAndClear();
1647 nPre_LinePos += nSpaces;
1648 nToken = HTML_TEXTTOKEN;
1650 break;
1651 // Keep those
1652 case HTML_TEXTTOKEN:
1653 nPre_LinePos += aToken.Len();
1654 break;
1656 case HTML_SELECT_ON:
1657 case HTML_SELECT_OFF:
1658 case HTML_BODY_ON:
1659 case HTML_FORM_ON:
1660 case HTML_FORM_OFF:
1661 case HTML_INPUT:
1662 case HTML_OPTION:
1663 case HTML_TEXTAREA_ON:
1664 case HTML_TEXTAREA_OFF:
1666 case HTML_IMAGE:
1667 case HTML_APPLET_ON:
1668 case HTML_APPLET_OFF:
1669 case HTML_PARAM:
1670 case HTML_EMBED:
1672 case HTML_HEAD1_ON:
1673 case HTML_HEAD1_OFF:
1674 case HTML_HEAD2_ON:
1675 case HTML_HEAD2_OFF:
1676 case HTML_HEAD3_ON:
1677 case HTML_HEAD3_OFF:
1678 case HTML_HEAD4_ON:
1679 case HTML_HEAD4_OFF:
1680 case HTML_HEAD5_ON:
1681 case HTML_HEAD5_OFF:
1682 case HTML_HEAD6_ON:
1683 case HTML_HEAD6_OFF:
1684 case HTML_BLOCKQUOTE_ON:
1685 case HTML_BLOCKQUOTE_OFF:
1686 case HTML_ADDRESS_ON:
1687 case HTML_ADDRESS_OFF:
1688 case HTML_HORZRULE:
1690 case HTML_CENTER_ON:
1691 case HTML_CENTER_OFF:
1692 case HTML_DIVISION_ON:
1693 case HTML_DIVISION_OFF:
1695 case HTML_SCRIPT_ON:
1696 case HTML_SCRIPT_OFF:
1697 case HTML_RAWDATA:
1699 case HTML_TABLE_ON:
1700 case HTML_TABLE_OFF:
1701 case HTML_CAPTION_ON:
1702 case HTML_CAPTION_OFF:
1703 case HTML_COLGROUP_ON:
1704 case HTML_COLGROUP_OFF:
1705 case HTML_COL_ON:
1706 case HTML_COL_OFF:
1707 case HTML_THEAD_ON:
1708 case HTML_THEAD_OFF:
1709 case HTML_TFOOT_ON:
1710 case HTML_TFOOT_OFF:
1711 case HTML_TBODY_ON:
1712 case HTML_TBODY_OFF:
1713 case HTML_TABLEROW_ON:
1714 case HTML_TABLEROW_OFF:
1715 case HTML_TABLEDATA_ON:
1716 case HTML_TABLEDATA_OFF:
1717 case HTML_TABLEHEADER_ON:
1718 case HTML_TABLEHEADER_OFF:
1720 case HTML_ANCHOR_ON:
1721 case HTML_ANCHOR_OFF:
1722 case HTML_BOLD_ON:
1723 case HTML_BOLD_OFF:
1724 case HTML_ITALIC_ON:
1725 case HTML_ITALIC_OFF:
1726 case HTML_STRIKE_ON:
1727 case HTML_STRIKE_OFF:
1728 case HTML_STRIKETHROUGH_ON:
1729 case HTML_STRIKETHROUGH_OFF:
1730 case HTML_UNDERLINE_ON:
1731 case HTML_UNDERLINE_OFF:
1732 case HTML_BASEFONT_ON:
1733 case HTML_BASEFONT_OFF:
1734 case HTML_FONT_ON:
1735 case HTML_FONT_OFF:
1736 case HTML_BLINK_ON:
1737 case HTML_BLINK_OFF:
1738 case HTML_SPAN_ON:
1739 case HTML_SPAN_OFF:
1740 case HTML_SUBSCRIPT_ON:
1741 case HTML_SUBSCRIPT_OFF:
1742 case HTML_SUPERSCRIPT_ON:
1743 case HTML_SUPERSCRIPT_OFF:
1744 case HTML_BIGPRINT_ON:
1745 case HTML_BIGPRINT_OFF:
1746 case HTML_SMALLPRINT_OFF:
1747 case HTML_SMALLPRINT_ON:
1749 case HTML_EMPHASIS_ON:
1750 case HTML_EMPHASIS_OFF:
1751 case HTML_CITIATION_ON:
1752 case HTML_CITIATION_OFF:
1753 case HTML_STRONG_ON:
1754 case HTML_STRONG_OFF:
1755 case HTML_CODE_ON:
1756 case HTML_CODE_OFF:
1757 case HTML_SAMPLE_ON:
1758 case HTML_SAMPLE_OFF:
1759 case HTML_KEYBOARD_ON:
1760 case HTML_KEYBOARD_OFF:
1761 case HTML_VARIABLE_ON:
1762 case HTML_VARIABLE_OFF:
1763 case HTML_DEFINSTANCE_ON:
1764 case HTML_DEFINSTANCE_OFF:
1765 case HTML_SHORTQUOTE_ON:
1766 case HTML_SHORTQUOTE_OFF:
1767 case HTML_LANGUAGE_ON:
1768 case HTML_LANGUAGE_OFF:
1769 case HTML_AUTHOR_ON:
1770 case HTML_AUTHOR_OFF:
1771 case HTML_PERSON_ON:
1772 case HTML_PERSON_OFF:
1773 case HTML_ACRONYM_ON:
1774 case HTML_ACRONYM_OFF:
1775 case HTML_ABBREVIATION_ON:
1776 case HTML_ABBREVIATION_OFF:
1777 case HTML_INSERTEDTEXT_ON:
1778 case HTML_INSERTEDTEXT_OFF:
1779 case HTML_DELETEDTEXT_ON:
1780 case HTML_DELETEDTEXT_OFF:
1781 case HTML_TELETYPE_ON:
1782 case HTML_TELETYPE_OFF:
1784 break;
1786 // The remainder is treated as an unknown token.
1787 default:
1788 if( nToken )
1790 nToken =
1791 ( ((HTML_TOKEN_ONOFF & nToken) && (1 & nToken))
1792 ? HTML_UNKNOWNCONTROL_OFF
1793 : HTML_UNKNOWNCONTROL_ON );
1795 break;
1798 bPre_IgnoreNewPara = false;
1800 return nToken;
1803 int HTMLParser::FilterXMP( int nToken )
1805 switch( nToken )
1807 case HTML_NEWPARA:
1808 if( bPre_IgnoreNewPara )
1809 nToken = 0;
1810 case HTML_TEXTTOKEN:
1811 case HTML_NONBREAKSPACE:
1812 case HTML_SOFTHYPH:
1813 break; // kept
1815 default:
1816 if( nToken )
1818 if( (HTML_TOKEN_ONOFF & nToken) && (1 & nToken) )
1820 sSaveToken.Insert( '<', 0 );
1821 sSaveToken.Insert( '/', 1 );
1823 else
1824 sSaveToken.Insert( '<', 0 );
1825 if( aToken.Len() )
1827 UnescapeToken();
1828 sSaveToken += (sal_Unicode)' ';
1829 aToken.Insert( sSaveToken, 0 );
1831 else
1832 aToken = sSaveToken;
1833 aToken += (sal_Unicode)'>';
1834 nToken = HTML_TEXTTOKEN;
1836 break;
1839 bPre_IgnoreNewPara = false;
1841 return nToken;
1844 int HTMLParser::FilterListing( int nToken )
1846 switch( nToken )
1848 case HTML_NEWPARA:
1849 if( bPre_IgnoreNewPara )
1850 nToken = 0;
1851 case HTML_TEXTTOKEN:
1852 case HTML_NONBREAKSPACE:
1853 case HTML_SOFTHYPH:
1854 break; // kept
1856 default:
1857 if( nToken )
1859 nToken =
1860 ( ((HTML_TOKEN_ONOFF & nToken) && (1 & nToken))
1861 ? HTML_UNKNOWNCONTROL_OFF
1862 : HTML_UNKNOWNCONTROL_ON );
1864 break;
1867 bPre_IgnoreNewPara = false;
1869 return nToken;
1872 bool HTMLParser::IsHTMLFormat( const sal_Char* pHeader,
1873 bool bSwitchToUCS2,
1874 rtl_TextEncoding eEnc )
1876 // If the string matches one of the following regular expressions then
1877 // the document is a HTML document.
1879 // ^[^<]*<[^ \t]*[> \t]
1880 // -------
1881 // ^<!
1883 // where the underlined subexpression has to be a HTML token
1884 OString sCmp;
1885 bool bUCS2B = false;
1886 if( bSwitchToUCS2 )
1888 if( 0xfeU == (sal_uChar)pHeader[0] &&
1889 0xffU == (sal_uChar)pHeader[1] )
1891 eEnc = RTL_TEXTENCODING_UCS2;
1892 bUCS2B = true;
1894 else if( 0xffU == (sal_uChar)pHeader[0] &&
1895 0xfeU == (sal_uChar)pHeader[1] )
1897 eEnc = RTL_TEXTENCODING_UCS2;
1902 RTL_TEXTENCODING_UCS2 == eEnc &&
1904 (0xfe == (sal_uChar)pHeader[0] && 0xff == (sal_uChar)pHeader[1]) ||
1905 (0xff == (sal_uChar)pHeader[0] && 0xfe == (sal_uChar)pHeader[1])
1909 if( 0xfe == (sal_uChar)pHeader[0] )
1910 bUCS2B = true;
1912 xub_StrLen nLen;
1913 for( nLen = 2;
1914 pHeader[nLen] != 0 || pHeader[nLen+1] != 0;
1915 nLen+=2 )
1918 OStringBuffer sTmp( (nLen - 2)/2 );
1919 for( xub_StrLen nPos = 2; nPos < nLen; nPos += 2 )
1921 sal_Unicode cUC;
1922 if( bUCS2B )
1923 cUC = (sal_Unicode(pHeader[nPos]) << 8) | pHeader[nPos+1];
1924 else
1925 cUC = (sal_Unicode(pHeader[nPos+1]) << 8) | pHeader[nPos];
1926 if( 0U == cUC )
1927 break;
1929 sTmp.append( cUC < 256U ? (sal_Char)cUC : '.' );
1931 sCmp = sTmp.makeStringAndClear();
1933 else
1935 sCmp = pHeader;
1938 sCmp = sCmp.toAsciiUpperCase();
1940 // A HTML document must have a '<' in the first line
1941 sal_Int32 nStart = sCmp.indexOf('<');
1942 if (nStart == -1)
1943 return false;
1944 nStart++;
1946 // followed by arbitrary characters followed by a blank or '>'
1947 sal_Char c;
1948 sal_Int32 nPos;
1949 for( nPos = nStart; nPos < sCmp.getLength(); ++nPos )
1951 if( '>'==(c=sCmp[nPos]) || HTML_ISSPACE(c) )
1952 break;
1955 // If the document ends after < it's no HTML
1956 if( nPos==nStart )
1957 return false;
1959 // the string following '<' has to be a known HTML token.
1960 // <DIR> is not interpreted as HTML. Otherwise the output of the DOS command "DIR"
1961 // could be interpreted as HTML.
1962 OUString sTest(OStringToOUString(sCmp.copy(nStart, nPos-nStart), RTL_TEXTENCODING_ASCII_US));
1963 int nTok = GetHTMLToken( sTest );
1964 if( 0 != nTok && HTML_DIRLIST_ON != nTok )
1965 return true;
1967 // "<!" at the very beginning of the file?
1968 if( nStart == 1 && '!' == sCmp[1] )
1969 return true;
1971 // <HTML> somewhere in the first 80 characters of the document
1972 nStart = sCmp.indexOfL(RTL_CONSTASCII_STRINGPARAM(OOO_STRING_SVTOOLS_HTML_html));
1973 if( nStart>0 && '<'==sCmp[nStart-1] &&
1974 nStart+4 < sCmp.getLength() && '>'==sCmp[nStart+4] )
1975 return true;
1977 // Else it's rather not a HTML document
1978 return false;
1981 bool HTMLParser::InternalImgToPrivateURL( String& rURL )
1983 if( rURL.Len() < 19 || 'i' != rURL.GetChar(0) ||
1984 rURL.CompareToAscii( OOO_STRING_SVTOOLS_HTML_internal_gopher, 9 ) != COMPARE_EQUAL )
1985 return false;
1987 bool bFound = false;
1989 if( rURL.CompareToAscii( OOO_STRING_SVTOOLS_HTML_internal_gopher,16) == COMPARE_EQUAL )
1991 String aName( rURL.Copy(16) );
1992 switch( aName.GetChar(0) )
1994 case 'b':
1995 bFound = aName.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_GOPHER_binary );
1996 break;
1997 case 'i':
1998 bFound = aName.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_GOPHER_image ) ||
1999 aName.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_GOPHER_index );
2000 break;
2001 case 'm':
2002 bFound = aName.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_GOPHER_menu ) ||
2003 aName.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_GOPHER_movie );
2004 break;
2005 case 's':
2006 bFound = aName.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_GOPHER_sound );
2007 break;
2008 case 't':
2009 bFound = aName.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_GOPHER_telnet ) ||
2010 aName.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_GOPHER_text );
2011 break;
2012 case 'u':
2013 bFound = aName.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_GOPHER_unknown );
2014 break;
2017 else if( rURL.CompareToAscii( OOO_STRING_SVTOOLS_HTML_internal_icon,14) == COMPARE_EQUAL )
2019 String aName( rURL.Copy(14) );
2020 switch( aName.GetChar(0) )
2022 case 'b':
2023 bFound = aName.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_ICON_baddata );
2024 break;
2025 case 'd':
2026 bFound = aName.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_ICON_delayed );
2027 break;
2028 case 'e':
2029 bFound = aName.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_ICON_embed );
2030 break;
2031 case 'i':
2032 bFound = aName.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_ICON_insecure );
2033 break;
2034 case 'n':
2035 bFound = aName.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_ICON_notfound );
2036 break;
2039 if( bFound )
2041 String sTmp ( rURL );
2042 rURL.AssignAscii( OOO_STRING_SVTOOLS_HTML_private_image );
2043 rURL.Append( sTmp );
2046 return bFound;
// Actions a <META> tag can trigger; HTML_META_NONE stands for "no known name".
enum eHtmlMetas {
    HTML_META_NONE           = 0,
    HTML_META_AUTHOR         = 1,
    HTML_META_DESCRIPTION    = 2,
    HTML_META_KEYWORDS       = 3,
    HTML_META_REFRESH        = 4,
    HTML_META_CLASSIFICATION = 5,
    HTML_META_CREATED        = 6,
    HTML_META_CHANGEDBY      = 7,
    HTML_META_CHANGED        = 8,
    HTML_META_GENERATOR      = 9,
    HTML_META_SDFOOTNOTE     = 10,
    HTML_META_SDENDNOTE      = 11,
    HTML_META_CONTENT_TYPE   = 12
};
2065 // <META NAME=xxx>
2066 static HTMLOptionEnum const aHTMLMetaNameTable[] =
2068 { OOO_STRING_SVTOOLS_HTML_META_author, HTML_META_AUTHOR },
2069 { OOO_STRING_SVTOOLS_HTML_META_changed, HTML_META_CHANGED },
2070 { OOO_STRING_SVTOOLS_HTML_META_changedby, HTML_META_CHANGEDBY },
2071 { OOO_STRING_SVTOOLS_HTML_META_classification,HTML_META_CLASSIFICATION},
2072 { OOO_STRING_SVTOOLS_HTML_META_content_type, HTML_META_CONTENT_TYPE },
2073 { OOO_STRING_SVTOOLS_HTML_META_created, HTML_META_CREATED },
2074 { OOO_STRING_SVTOOLS_HTML_META_description, HTML_META_DESCRIPTION },
2075 { OOO_STRING_SVTOOLS_HTML_META_keywords, HTML_META_KEYWORDS },
2076 { OOO_STRING_SVTOOLS_HTML_META_generator, HTML_META_GENERATOR },
2077 { OOO_STRING_SVTOOLS_HTML_META_refresh, HTML_META_REFRESH },
2078 { OOO_STRING_SVTOOLS_HTML_META_sdendnote, HTML_META_SDENDNOTE },
2079 { OOO_STRING_SVTOOLS_HTML_META_sdfootnote, HTML_META_SDFOOTNOTE },
2080 { 0, 0 }
2084 void HTMLParser::AddMetaUserDefined( OUString const & )
2088 bool HTMLParser::ParseMetaOptionsImpl(
2089 const uno::Reference<document::XDocumentProperties> & i_xDocProps,
2090 SvKeyValueIterator *i_pHTTPHeader,
2091 const HTMLOptions& aOptions,
2092 rtl_TextEncoding& o_rEnc )
2094 String aName, aContent;
2095 sal_uInt16 nAction = HTML_META_NONE;
2096 bool bHTTPEquiv = false, bChanged = false;
2098 for ( size_t i = aOptions.size(); i; )
2100 const HTMLOption& aOption = aOptions[--i];
2101 switch ( aOption.GetToken() )
2103 case HTML_O_NAME:
2104 aName = aOption.GetString();
2105 if ( HTML_META_NONE==nAction )
2107 aOption.GetEnum( nAction, aHTMLMetaNameTable );
2109 break;
2110 case HTML_O_HTTPEQUIV:
2111 aName = aOption.GetString();
2112 aOption.GetEnum( nAction, aHTMLMetaNameTable );
2113 bHTTPEquiv = true;
2114 break;
2115 case HTML_O_CONTENT:
2116 aContent = aOption.GetString();
2117 break;
2121 if ( bHTTPEquiv || HTML_META_DESCRIPTION != nAction )
2123 // if it is not a Description, remove CRs and LFs from CONTENT
2124 aContent = comphelper::string::remove(aContent, '\r');
2125 aContent = comphelper::string::remove(aContent, '\n');
2127 else
2129 // convert line endings for Description
2130 aContent = convertLineEnd(aContent, GetSystemLineEnd());
2134 if ( bHTTPEquiv && i_pHTTPHeader )
2136 // Netscape seems to just ignore a closing ", so we do too
2137 if ( aContent.Len() && '"' == aContent.GetChar( aContent.Len()-1 ) )
2139 aContent.Erase( aContent.Len() - 1 );
2141 SvKeyValue aKeyValue( aName, aContent );
2142 i_pHTTPHeader->Append( aKeyValue );
2145 switch ( nAction )
2147 case HTML_META_AUTHOR:
2148 if (i_xDocProps.is()) {
2149 i_xDocProps->setAuthor( aContent );
2150 bChanged = true;
2152 break;
2153 case HTML_META_DESCRIPTION:
2154 if (i_xDocProps.is()) {
2155 i_xDocProps->setDescription( aContent );
2156 bChanged = true;
2158 break;
2159 case HTML_META_KEYWORDS:
2160 if (i_xDocProps.is()) {
2161 i_xDocProps->setKeywords(
2162 ::comphelper::string::convertCommaSeparated(aContent));
2163 bChanged = true;
2165 break;
2166 case HTML_META_CLASSIFICATION:
2167 if (i_xDocProps.is()) {
2168 i_xDocProps->setSubject( aContent );
2169 bChanged = true;
2171 break;
2173 case HTML_META_CHANGEDBY:
2174 if (i_xDocProps.is()) {
2175 i_xDocProps->setModifiedBy( aContent );
2177 break;
2179 case HTML_META_CREATED:
2180 case HTML_META_CHANGED:
2181 if ( i_xDocProps.is() && aContent.Len() &&
2182 comphelper::string::getTokenCount(aContent, ';') == 2 )
2184 Date aDate( (sal_uLong)aContent.GetToken(0).ToInt32() );
2185 Time aTime( (sal_uLong)aContent.GetToken(1).ToInt32() );
2186 DateTime aDateTime( aDate, aTime );
2187 ::util::DateTime uDT(aDateTime.GetNanoSec(),
2188 aDateTime.GetSec(), aDateTime.GetMin(),
2189 aDateTime.GetHour(), aDateTime.GetDay(),
2190 aDateTime.GetMonth(), aDateTime.GetYear(),
2191 false);
2192 if ( HTML_META_CREATED==nAction )
2193 i_xDocProps->setCreationDate( uDT );
2194 else
2195 i_xDocProps->setModificationDate( uDT );
2196 bChanged = true;
2198 break;
2200 case HTML_META_REFRESH:
2201 DBG_ASSERT( !bHTTPEquiv || i_pHTTPHeader,
2202 "Reload-URL aufgrund unterlassener MUSS-Aenderung verlorengegangen" );
2203 break;
2205 case HTML_META_CONTENT_TYPE:
2206 if ( aContent.Len() )
2208 o_rEnc = GetEncodingByMIME( aContent );
2210 break;
2212 case HTML_META_NONE:
2213 if ( !bHTTPEquiv )
2215 if (i_xDocProps.is())
2217 uno::Reference<beans::XPropertyContainer> xUDProps
2218 = i_xDocProps->getUserDefinedProperties();
2219 try {
2220 xUDProps->addProperty(aName,
2221 beans::PropertyAttribute::REMOVABLE,
2222 uno::makeAny(OUString(aContent)));
2223 AddMetaUserDefined(aName);
2224 bChanged = true;
2225 } catch (uno::Exception &) {
2226 // ignore
2230 break;
2231 default:
2232 break;
2235 return bChanged;
2238 bool HTMLParser::ParseMetaOptions(
2239 const uno::Reference<document::XDocumentProperties> & i_xDocProps,
2240 SvKeyValueIterator *i_pHeader )
2242 sal_uInt16 nContentOption = HTML_O_CONTENT;
2243 rtl_TextEncoding eEnc = RTL_TEXTENCODING_DONTKNOW;
2245 bool bRet = ParseMetaOptionsImpl( i_xDocProps, i_pHeader,
2246 GetOptions(&nContentOption),
2247 eEnc );
2249 // If the encoding is set by a META tag, it may only overwrite the
2250 // current encoding if both, the current and the new encoding, are 1-sal_uInt8
2251 // encodings. Everything else cannot lead to reasonable results.
2252 if (RTL_TEXTENCODING_DONTKNOW != eEnc &&
2253 rtl_isOctetTextEncoding( eEnc ) &&
2254 rtl_isOctetTextEncoding( GetSrcEncoding() ) )
2256 eEnc = GetExtendedCompatibilityTextEncoding( eEnc );
2257 SetSrcEncoding( eEnc );
2260 return bRet;
2263 rtl_TextEncoding HTMLParser::GetEncodingByMIME( const String& rMime )
2265 OUString sType;
2266 OUString sSubType;
2267 INetContentTypeParameterList aParameters;
2268 if (INetContentTypes::parse(rMime, sType, sSubType, &aParameters))
2270 const INetContentTypeParameter * pCharset = aParameters.find("charset");
2271 if (pCharset != 0)
2273 OString sValue(OUStringToOString(pCharset->m_sValue, RTL_TEXTENCODING_ASCII_US));
2274 return GetExtendedCompatibilityTextEncoding( rtl_getTextEncodingFromMimeCharset( sValue.getStr() ) );
2277 return RTL_TEXTENCODING_DONTKNOW;
2280 rtl_TextEncoding HTMLParser::GetEncodingByHttpHeader( SvKeyValueIterator *pHTTPHeader )
2282 rtl_TextEncoding eRet = RTL_TEXTENCODING_DONTKNOW;
2283 if( pHTTPHeader )
2285 SvKeyValue aKV;
2286 for( bool bCont = pHTTPHeader->GetFirst( aKV ); bCont;
2287 bCont = pHTTPHeader->GetNext( aKV ) )
2289 if( aKV.GetKey().EqualsIgnoreCaseAscii( OOO_STRING_SVTOOLS_HTML_META_content_type ) )
2291 if( aKV.GetValue().Len() )
2293 eRet = HTMLParser::GetEncodingByMIME( aKV.GetValue() );
2298 return eRet;
2301 bool HTMLParser::SetEncodingByHTTPHeader( SvKeyValueIterator *pHTTPHeader )
2303 bool bRet = false;
2304 rtl_TextEncoding eEnc = HTMLParser::GetEncodingByHttpHeader( pHTTPHeader );
2305 if(RTL_TEXTENCODING_DONTKNOW != eEnc)
2307 SetSrcEncoding( eEnc );
2308 bRet = true;
2310 return bRet;
2314 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */