1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 * This file incorporates work covered by the following license notice:
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
22 #include <svtools/svtdllapi.h>
23 #include <svtools/svparser.hxx>
24 #include <svtools/htmltokn.h>
26 #include <string_view>
// Forward declaration of the UNO Reference template. In this header it is
// only used as `const css::uno::Reference<...>&` parameter types.
29 namespace com :: sun :: star :: uno
{ template <class interface_type
> class Reference
; }
// Forward declaration of XDocumentProperties.
// NOTE(review): the declarations further down name it
// css::document::XDocumentProperties, and no closing brace is visible here;
// the nested `document` namespace line and the closing braces appear to be
// missing from this listing.
31 namespace com::sun::star
{
33 class XDocumentProperties
;
// Forward declaration of the option token enum stored in HTMLOption::nToken
// and returned by HTMLOption::GetToken().
38 enum class HtmlOptionId
;
// Default sizes for the seven HTML font size steps (HTMLFONTSZ1 .. HTMLFONTSZ7).
// NOTE(review): the unit is not stated in this header; presumably points.
// Confirm at the use sites.
40 #define HTMLFONTSZ1_DFLT 7
41 #define HTMLFONTSZ2_DFLT 10
42 #define HTMLFONTSZ3_DFLT 12
43 #define HTMLFONTSZ4_DFLT 14
44 #define HTMLFONTSZ5_DFLT 18
45 #define HTMLFONTSZ6_DFLT 24
46 #define HTMLFONTSZ7_DFLT 36
// Result type of HTMLOption::GetTableFrame(), i.e. the parsed value of
// <TABLE FRAME=...>.
// NOTE(review): the enumerator names mirror the HTML 4 FRAME keywords
// (void, above, below, hsides, lhs, rhs, vsides, box); the string-to-enum
// mapping itself lives in the implementation file. Confirm there.
48 enum class HTMLTableFrame
{ Void
, Above
, Below
, HSides
, LHS
, RHS
, VSides
, Box
};
// Result type of HTMLOption::GetTableRules(), i.e. the parsed value of
// <TABLE RULES=...>.
// NOTE(review): the enumerator names mirror the HTML 4 RULES keywords
// (none, groups, rows, cols, all); the string-to-enum mapping itself lives
// in the implementation file. Confirm there.
50 enum class HTMLTableRules
{ NONE
, Groups
, Rows
, Cols
, All
};
// Result type of HTMLOption::GetInputType() (<INPUT TYPE=...>).
// NOTE(review): the enumerator list is not part of this listing.
52 enum class HTMLInputType
// Script language type; HTMLParser::ParseScriptOptions() takes a reference
// to one (rLang).
// NOTE(review): the enumerator list is not part of this listing.
68 enum class HTMLScriptLanguage
// One entry of a lookup table that pairs an option value string with an enum
// value. HTMLOption::GetEnum() walks an array of these entries for as long as
// pName is non-null, so a table must end with an entry whose pName is null.
// NOTE(review): the `struct HTMLOptionEnum` header line and its braces are
// missing from this listing; the name is taken from the GetEnum() signatures.
75 template<typename EnumT
>
78 const char *pName
; // value of an HTML option
79 EnumT nValue
; // and corresponding value of an enum
82 /** Representation of an HTML option (=attribute in a start tag).
83 * The values of the options are always stored as strings.
84 * The methods GetNumber,... may only be called if the option
85 * is actually numerical,...
// NOTE(review): the closing line of the comment above, the class braces and
// the access specifiers are missing from this listing.
87 class SVT_DLLPUBLIC HTMLOption
89 OUString aValue
; // value of the option (always as string)
90 OUString aToken
; // name of the option as string
91 HtmlOptionId nToken
; // and respective token
// Constructor taking the option's token id, its name and its value.
95 HTMLOption( HtmlOptionId nTyp
, OUString aToken
, OUString aValue
);
97 // name of the option...
98 HtmlOptionId
GetToken() const { return nToken
; } // ... as enum
99 const OUString
& GetTokenString() const { return aToken
; } // ... as string
101 // value of the option ...
102 const OUString
& GetString() const { return aValue
; } // ... as string
104 sal_uInt32
GetNumber() const; // ... as number
105 sal_Int32
GetSNumber() const; // ... as number
106 void GetNumbers( std::vector
<sal_uInt32
> &rNumbers
) const; // ... as numbers
107 void GetColor( Color
& ) const; // ... as color
// Looks the option value up in the table pOptEnums. The comparison is
// ASCII-case-insensitive, and the nValue of the first matching entry is
// returned. The table is scanned while pName is non-null.
// NOTE(review): the loop increment and the fallback return are not visible
// in this listing; presumably nDflt is returned when nothing matches.
109 template<typename EnumT
>
110 EnumT
GetEnum( const HTMLOptionEnum
<EnumT
> *pOptEnums
,
111 EnumT nDflt
= static_cast<EnumT
>(0) ) const
113 while( pOptEnums
->pName
)
115 if( aValue
.equalsIgnoreAsciiCaseAscii( pOptEnums
->pName
) )
116 return pOptEnums
->nValue
;
// Variant that writes the matching enum value into rEnum and reports the
// outcome through its bool result.
// NOTE(review): the return statements are not visible in this listing;
// presumably true on a match and false otherwise, leaving rEnum untouched.
122 template<typename EnumT
>
123 bool GetEnum( EnumT
&rEnum
, const HTMLOptionEnum
<EnumT
> *pOptEnums
) const
125 while( pOptEnums
->pName
)
127 if( aValue
.equalsIgnoreAsciiCaseAscii( pOptEnums
->pName
) )
129 rEnum
= pOptEnums
->nValue
;
137 // ... and as a few special enums
138 HTMLInputType
GetInputType() const; // <INPUT TYPE=...>
139 HTMLTableFrame
GetTableFrame() const; // <TABLE FRAME=...>
140 HTMLTableRules
GetTableRules() const; // <TABLE RULES=...>
141 //SvxAdjust GetAdjust() const; // <P,TH,TD ALIGN=>
// Container type for the options of one start tag; HTMLParser keeps one in
// maOptions and returns it from GetOptions().
144 typedef ::std::vector
<HTMLOption
> HTMLOptions
;
// HTML parser derived from SvParser<HtmlTokenId>. It holds the option list of
// the current start tag and a set of one-bit mode flags (PRE, LISTING, XMP,
// TEXTAREA, SCRIPT, STYLE, ...). It also declares the token filtering helpers
// and the helpers for parsing META and SCRIPT options.
// NOTE(review): class braces and access specifiers are missing from this
// listing, so member visibility cannot be read off here.
146 class SVT_DLLPUBLIC HTMLParser
: public SvParser
<HtmlTokenId
>
149 mutable HTMLOptions maOptions
; // options of the start tag
151 bool bNewDoc
: 1; // read new Doc?
152 bool bIsInHeader
: 1; // scan header section
153 bool bReadListing
: 1; // read listings
154 bool bReadXMP
: 1; // read XMP
155 bool bReadPRE
: 1; // read preformatted text
156 bool bReadTextArea
: 1; // read TEXTAREA
157 bool bReadScript
: 1; // read <SCRIPT>
158 bool bReadStyle
: 1; // read <STYLE>
159 bool bEndTokenFound
: 1; // found </SCRIPT> or </STYLE>
161 bool bPre_IgnoreNewPara
: 1; // flags for reading of PRE paragraphs
162 bool bReadNextChar
: 1; // true: read NextChar again(JavaScript!)
163 bool bReadComment
: 1; // true: read NextChar again (JavaScript!)
// NOTE(review): the comment on bReadComment repeats the one on bReadNextChar
// word for word and looks like a copy-paste. Presumably this flag marks that
// a comment is currently being read; confirm against the implementation.
165 sal_uInt32 nPre_LinePos
; // Pos in the line in the PRE-Tag
167 HtmlTokenId mnPendingOffToken
; ///< OFF token pending for a <XX.../> ON/OFF ON token
171 /// XML namespace, in case of XHTML.
172 OUString maNamespace
;
175 OUString sSaveToken
; // the read tag as string
// Scanning helper; cBreak defaults to 0.
// NOTE(review): presumably 0 means "no extra break character"; confirm in
// the implementation.
177 HtmlTokenId
ScanText( const sal_Unicode cBreak
= 0U );
179 HtmlTokenId
GetNextRawToken();
// Override of the SvParser token hook.
182 virtual HtmlTokenId
GetNextToken_() override
;
184 virtual ~HTMLParser() override
;
// Clears bIsInHeader.
186 void FinishHeader() { bIsInHeader
= false; }
188 void SetNamespace(std::u16string_view rNamespace
);
// Constructs a parser reading from rIn; bReadNewDoc defaults to true.
191 HTMLParser( SvStream
& rIn
, bool bReadNewDoc
= true );
193 virtual SvParserState
CallParser() override
;
// Read-only accessors for the mode flags declared above.
195 bool IsNewDoc() const { return bNewDoc
; }
196 bool IsInHeader() const { return bIsInHeader
; }
197 bool IsReadListing() const { return bReadListing
; }
198 bool IsReadXMP() const { return bReadXMP
; }
199 bool IsReadPRE() const { return bReadPRE
; }
200 bool IsReadScript() const { return bReadScript
; }
201 bool IsReadStyle() const { return bReadStyle
; }
203 // start PRE-/LISTING or XMP mode or filter tags respectively
204 inline void StartPRE();
205 void FinishPRE() { bReadPRE
= false; }
206 HtmlTokenId
FilterPRE( HtmlTokenId nToken
);
208 inline void StartListing();
209 void FinishListing() { bReadListing
= false; }
210 HtmlTokenId
FilterListing( HtmlTokenId nToken
);
212 inline void StartXMP();
213 void FinishXMP() { bReadXMP
= false; }
214 HtmlTokenId
FilterXMP( HtmlTokenId nToken
);
// Clears bReadTextArea.
216 void FinishTextArea() { bReadTextArea
= false; }
218 // finish PRE-/LISTING- and XMP mode
219 void FinishPREListingXMP() { bReadPRE
= bReadListing
= bReadXMP
= false; }
221 // Filter the current token according to the current mode
222 // (PRE, XMP, ...) and set the flags. Is called by Continue before
223 // NextToken is called. If you implement own loops or call
224 // NextToken yourself, you should call this method beforehand.
225 HtmlTokenId
FilterToken( HtmlTokenId nToken
);
// Stores rEndToken in aEndToken. The declaration of aEndToken is not part of
// this listing.
227 void ReadRawData( const OUString
&rEndToken
) { aEndToken
= rEndToken
; }
229 // Token without \-sequences
230 void UnescapeToken();
232 // Determine the options. pNoConvertToken is the optional token
233 // of an option, for which the CR/LFs are not deleted from the value
235 const HTMLOptions
& GetOptions( HtmlOptionId
const *pNoConvertToken
=nullptr );
237 // for asynchronous reading from the SvStream
238 virtual void Continue( HtmlTokenId nToken
) override
;
// Static helper that maps a MIME string to an rtl_TextEncoding.
243 static rtl_TextEncoding
GetEncodingByMIME( const OUString
& rMime
);
245 /// template method: called when ParseMetaOptions adds a user-defined meta
246 virtual void AddMetaUserDefined( OUString
const & i_rMetaName
);
249 /// parse meta options into XDocumentProperties and encoding
250 bool ParseMetaOptionsImpl( const css::uno::Reference
< css::document::XDocumentProperties
>&,
253 rtl_TextEncoding
& rEnc
);
256 /// overriding method must call this implementation!
257 virtual bool ParseMetaOptions( const css::uno::Reference
< css::document::XDocumentProperties
>&,
258 SvKeyValueIterator
* );
// Takes rBaseURL by value; every other parameter is a non-const reference.
// NOTE(review): presumably those are out-parameters filled from the options
// of a SCRIPT tag; confirm in the implementation.
260 void ParseScriptOptions( OUString
& rLangString
, std::u16string_view rBaseURL
, HTMLScriptLanguage
& rLang
,
261 OUString
& rSrc
, OUString
& rLibrary
, OUString
& rModule
);
263 // Remove a comment around the content of <SCRIPT> or <STYLE>.
264 // The whole line behind a "<!--" might be deleted (for JavaScript).
265 static void RemoveSGMLComment( OUString
&rString
);
// Takes rURL by non-const reference and returns bool.
// NOTE(review): presumably rewrites rURL in place and reports whether it did;
// confirm in the implementation.
267 static bool InternalImgToPrivateURL( OUString
& rURL
);
268 static rtl_TextEncoding
GetEncodingByHttpHeader( SvKeyValueIterator
*pHTTPHeader
);
269 bool SetEncodingByHTTPHeader( SvKeyValueIterator
*pHTTPHeader
);
// Inline definition of HTMLParser::StartPRE(). The visible statement sets
// bPre_IgnoreNewPara.
// NOTE(review): the braces and the other statements of the body are missing
// from this listing.
272 inline void HTMLParser::StartPRE()
275 bPre_IgnoreNewPara
= true;
// Inline definition of HTMLParser::StartListing(). The visible statement sets
// bPre_IgnoreNewPara.
// NOTE(review): the braces and the other statements of the body are missing
// from this listing.
279 inline void HTMLParser::StartListing()
282 bPre_IgnoreNewPara
= true;
// Inline definition of HTMLParser::StartXMP(). The visible statement sets
// bPre_IgnoreNewPara.
// NOTE(review): the braces and the other statements of the body are missing
// from this listing.
286 inline void HTMLParser::StartXMP()
289 bPre_IgnoreNewPara
= true;
293 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */