1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
10 #include "filterdetect.hxx"
12 #include <svtools/htmltokn.h>
13 #include <tools/urlobj.hxx>
14 #include <tools/zcodec.hxx>
15 #include <ucbhelper/content.hxx>
16 #include <unotools/mediadescriptor.hxx>
17 #include <unotools/streamwrap.hxx>
18 #include <unotools/ucbstreamhelper.hxx>
20 #include <com/sun/star/io/XInputStream.hpp>
21 #include <cppuhelper/supportsservice.hxx>
// Filter names stored under MediaDescriptor::PROP_FILTERNAME by
// PlainTextFilterDetect::detect() for the "generic_Text" type.
24 constexpr OUString WRITER_TEXT_FILTER
= u
"Text"_ustr
;
25 constexpr OUString CALC_TEXT_FILTER
= u
"Text - txt - csv (StarCalc)"_ustr
;
// Filter names stored under MediaDescriptor::PROP_FILTERNAME by
// PlainTextFilterDetect::detect() for the "generic_HTML" / "calc_HTML" types.
// They are wrapped in OUString(...) at the point of use.
27 constexpr OUStringLiteral WEB_HTML_FILTER
= u
"HTML";
28 constexpr OUStringLiteral WRITER_HTML_FILTER
= u
"HTML (StarWriter)";
29 constexpr OUStringLiteral CALC_HTML_FILTER
= u
"calc_HTML_WebQuery";
// Document service names that detect() compares against the descriptor's
// PROP_DOCUMENTSERVICE value to choose between the Writer and Calc filters.
31 constexpr OUString WRITER_DOCSERVICE
= u
"com.sun.star.text.TextDocument"_ustr
;
32 constexpr OUString CALC_DOCSERVICE
= u
"com.sun.star.sheet.SpreadsheetDocument"_ustr
;
// Lets the code below write uno::, io:: and beans:: without the full
// ::com::sun::star prefix, and MediaDescriptor without utl::.
34 using namespace ::com::sun::star
;
35 using utl::MediaDescriptor
;
/**
 * Checks whether the given input stream begins with a known HTML tag.
 *
 * A header is read from the stream with nSize (4096) as the read size, using
 * an 8-bit read when the position after StartReadingUnicodeText() is 0 or 3
 * and a 16-bit read (converted to ASCII) otherwise. The header is then
 * scanned one character at a time to find the name of the first tag; that
 * name is lower-cased and looked up with GetHTMLToken().
 *
 * @param xInStream  input stream whose beginning is inspected.
 * @return true for a "<!" prefix (DOCTYPE or comment, see that branch);
 *         false on each branch marked "Invalid" below; otherwise whether
 *         the first tag name is a token known to GetHTMLToken().
 */
39 bool IsHTMLStream( const uno::Reference
<io::XInputStream
>& xInStream
)
// Wrap the UNO input stream in an SvStream held by a unique_ptr.
41 std::unique_ptr
<SvStream
> pInStream( utl::UcbStreamHelper::CreateStream( xInStream
) );
// Guard: no stream was created, or the stream reports an error.
42 if ( !pInStream
|| pInStream
->GetError() )
46 // Read the stream header
47 pInStream
->StartReadingUnicodeText( RTL_TEXTENCODING_DONTKNOW
);
// The stream position after the call above selects the read width below.
// NOTE(review): nUniPos presumably equals the size of a byte-order mark
// consumed by StartReadingUnicodeText() - confirm.
48 const sal_uInt64 nUniPos
= pInStream
->Tell();
49 const sal_uInt16 nSize
= 4096;
52 if ( nUniPos
== 3 || nUniPos
== 0 ) // UTF-8 or non-Unicode
53 sHeader
= read_uInt8s_ToOString( *pInStream
, nSize
);
54 else // UTF-16 (nUniPos = 2)
// The 16-bit text is converted to an ASCII byte string so that the scan
// below can walk it through a plain const char*.
55 sHeader
= OUStringToOString( read_uInt16s_ToOUString( *pInStream
, nSize
), RTL_TEXTENCODING_ASCII_US
);
57 // Now check whether the stream begins with a known HTML tag.
58 enum DetectPhase
{ BeforeTag
, TagOpened
, InTagName
};
59 DetectPhase dp
= BeforeTag
;
60 /// BeforeDeclaration -> ? -> DeclarationOpened -> > -> BeforeDeclaration.
66 DeclarationPhase eDeclaration
= BeforeDeclaration
;
// Scan state: read pointer into the header, header length, running index
// and the index at which the tag name starts.
68 const char* pHeader
= sHeader
.getStr();
69 const int nLength
= sHeader
.getLength();
70 int i
= 0, nStartOfTagIndex
= 0;
// i and pHeader advance together, so i is always the index of *pHeader.
72 for ( i
= 0; i
< nLength
; ++i
, ++pHeader
)
// Whitespace seen while no '?' declaration is open.
75 if ((c
== ' ' || c
== '\n' || c
== '\t' || c
== '\r' || c
== '\f')
76 && eDeclaration
== BeforeDeclaration
)
78 if ( dp
== TagOpened
)
79 return false; // Invalid: Should start with a tag name
80 else if ( dp
== InTagName
)
81 break; // End of tag name reached
85 if ( dp
== BeforeTag
)
88 return false; // Invalid: Nested '<'
92 if ( dp
== InTagName
)
93 break; // End of tag name reached
// An open declaration is closed here: the declaration state is reset.
94 else if (eDeclaration
== DeclarationOpened
)
97 eDeclaration
= BeforeDeclaration
;
100 return false; // Invalid: Empty tag or before '<'
104 if ( dp
== TagOpened
)
105 return true; // "<!" - DOCTYPE or comments block
107 return false; // Invalid: '!' before '<' or inside tag name
111 if ( dp
== BeforeTag
)
112 return false; // Invalid: Should start with a tag
113 else if ( dp
== TagOpened
)
// Directly after the tag was opened: a '?' opens a declaration; any other
// character seen while no declaration is open marks the start of the tag name.
115 if (c
== '?' && eDeclaration
== BeforeDeclaration
)
116 eDeclaration
= DeclarationOpened
;
117 else if (eDeclaration
== BeforeDeclaration
)
119 nStartOfTagIndex
= i
;
126 // The string following '<' has to be a known HTML token.
// The candidate name spans [nStartOfTagIndex, i); it is lower-cased before lookup.
127 OString aToken
= sHeader
.copy( nStartOfTagIndex
, i
- nStartOfTagIndex
);
128 return GetHTMLToken( OStringToOUString( aToken
.toAsciiLowerCase(), RTL_TEXTENCODING_ASCII_US
) ) != HtmlTokenId::NONE
;
/// Constructor; the body is empty.
132 PlainTextFilterDetect::PlainTextFilterDetect() {}
/// Destructor; the body is empty.
134 PlainTextFilterDetect::~PlainTextFilterDetect() {}
/**
 * Chooses an import filter for the HTML and plain-text type names and stores
 * it in the media descriptor.
 *
 * The type name and document service are read from the descriptor.
 *  - "generic_HTML" / "calc_HTML": the input stream is checked with
 *    IsHTMLStream() and one of the HTML filter names is stored.
 *  - "generic_Text": decompression of the stream is attempted first; when it
 *    succeeds the descriptor's streams and URL are replaced. A text filter
 *    name is then chosen from the document service, or from the file
 *    extension when the service is neither Calc nor Writer.
 * The modified descriptor is written back into lDescriptor.
 *
 * @param lDescriptor  media descriptor properties; updated in place.
 * @return NOTE(review): presumably the accepted type name, or an empty
 *         string when nothing is detected - confirm against the return
 *         statements.
 */
136 OUString SAL_CALL
PlainTextFilterDetect::detect(uno::Sequence
<beans::PropertyValue
>& lDescriptor
)
138 MediaDescriptor
aMediaDesc(lDescriptor
);
// Both values default to an empty string when absent from the descriptor.
140 OUString aType
= aMediaDesc
.getUnpackedValueOrDefault(MediaDescriptor::PROP_TYPENAME
, OUString() );
141 OUString aDocService
= aMediaDesc
.getUnpackedValueOrDefault(MediaDescriptor::PROP_DOCUMENTSERVICE
, OUString() );
// HTML type names.
143 if ((aType
== "generic_HTML") || (aType
== "calc_HTML"))
145 uno::Reference
<io::XInputStream
> xInStream(aMediaDesc
[MediaDescriptor::PROP_INPUTSTREAM
], uno::UNO_QUERY
);
// Guard: no input stream, or its content does not pass the HTML check.
146 if (!xInStream
.is() || !IsHTMLStream(xInStream
))
// "calc_HTML" selects the Calc HTML filter even when the document service
// is not the Calc one.
149 if ((aDocService
== CALC_DOCSERVICE
) || (aType
== "calc_HTML"))
150 aMediaDesc
[MediaDescriptor::PROP_FILTERNAME
] <<= OUString(CALC_HTML_FILTER
);
151 else if (aDocService
== WRITER_DOCSERVICE
)
152 aMediaDesc
[MediaDescriptor::PROP_FILTERNAME
] <<= OUString(WRITER_HTML_FILTER
);
154 aMediaDesc
[MediaDescriptor::PROP_FILTERNAME
] <<= OUString(WEB_HTML_FILTER
);
// Plain-text type name.
157 else if (aType
== "generic_Text")
159 uno::Reference
<io::XStream
> xStream(aMediaDesc
[MediaDescriptor::PROP_STREAM
], uno::UNO_QUERY
);
160 uno::Reference
<io::XInputStream
> xInStream(aMediaDesc
[MediaDescriptor::PROP_INPUTSTREAM
], uno::UNO_QUERY
);
// Decompression is only attempted when at least one stream is available.
161 if (xStream
.is() || xInStream
.is())
// pInStream is created from xStream or from xInStream.
// NOTE(review): presumably xStream is preferred when it is set - confirm.
164 std::unique_ptr
<SvStream
> pInStream
;
166 pInStream
= utl::UcbStreamHelper::CreateStream(xStream
);
168 pInStream
= utl::UcbStreamHelper::CreateStream(xInStream
);
// In-memory target that receives the decompressed data.
169 std::unique_ptr
<SvMemoryStream
> pDecompressedStream(new SvMemoryStream());
170 if (aCodecGZ
.AttemptDecompression(*pInStream
, *pDecompressedStream
))
// Ownership of the memory stream moves into the wrapper, which then
// replaces both stream entries of the descriptor.
172 uno::Reference
<io::XStream
> xStreamDecompressed(new utl::OStreamWrapper(std::move(pDecompressedStream
)));
173 aMediaDesc
[MediaDescriptor::PROP_STREAM
] <<= xStreamDecompressed
;
174 aMediaDesc
[MediaDescriptor::PROP_INPUTSTREAM
] <<= xStreamDecompressed
->getInputStream();
// The URL is cut at the last ".gz", so the extension lookup below sees the
// name without that suffix.
175 OUString aURL
= aMediaDesc
.getUnpackedValueOrDefault(MediaDescriptor::PROP_URL
, OUString() );
176 sal_Int32 nIdx
= aURL
.lastIndexOf(".gz");
178 aMediaDesc
[MediaDescriptor::PROP_URL
] <<= aURL
.copy(0, nIdx
);
181 // Get the file name extension.
182 INetURLObject
aParser(aMediaDesc
.getUnpackedValueOrDefault(MediaDescriptor::PROP_URL
, OUString() ) );
183 OUString aExt
= aParser
.getExtension(INetURLObject::LAST_SEGMENT
, true, INetURLObject::DecodeMechanism::WithCharset
);
// Extension and name are lower-cased so the comparisons below ignore ASCII case.
184 aExt
= aExt
.toAsciiLowerCase();
185 OUString aName
= aParser
.getName().toAsciiLowerCase();
187 // Decide which filter to use based on the document service first,
188 // then on extension if that's not available.
190 if (aDocService
== CALC_DOCSERVICE
)
191 aMediaDesc
[MediaDescriptor::PROP_FILTERNAME
] <<= CALC_TEXT_FILTER
;
192 else if (aDocService
== WRITER_DOCSERVICE
)
193 aMediaDesc
[MediaDescriptor::PROP_FILTERNAME
] <<= WRITER_TEXT_FILTER
;
// csv/tsv/tab/xls extensions, and names ending in ".csv.gz", use the Calc text filter.
194 else if (aExt
== "csv" || aExt
== "tsv" || aExt
== "tab" || aExt
== "xls" || aName
.endsWith(".csv.gz"))
195 aMediaDesc
[MediaDescriptor::PROP_FILTERNAME
] <<= CALC_TEXT_FILTER
;
197 aMediaDesc
[MediaDescriptor::PROP_FILTERNAME
] <<= WRITER_TEXT_FILTER
;
201 // Nothing to detect.
// Write the (possibly modified) descriptor back to the caller's sequence.
204 aMediaDesc
>> lDescriptor
;
/// initialize(): the argument sequence is deliberately left unnamed (its
/// name is commented out), so the arguments cannot be referenced in the body.
210 void SAL_CALL
PlainTextFilterDetect::initialize(const uno::Sequence
<uno::Any
>& /*aArguments*/)
/// Returns the implementation name string of this component.
214 OUString
PlainTextFilterDetect_getImplementationName()
216 return u
"com.sun.star.comp.filters.PlainTextFilterDetect"_ustr
;
/// Returns the two supported service names: the ExtendedTypeDetection
/// service and the component's own name.
219 uno::Sequence
<OUString
> PlainTextFilterDetect_getSupportedServiceNames()
221 return { u
"com.sun.star.document.ExtendedTypeDetection"_ustr
, u
"com.sun.star.comp.filters.PlainTextFilterDetect"_ustr
};
/// Forwards to the free function PlainTextFilterDetect_getImplementationName().
225 OUString SAL_CALL
PlainTextFilterDetect::getImplementationName()
227 return PlainTextFilterDetect_getImplementationName();
/// Delegates the service-name check to cppu::supportsService(), passing this
/// object and the requested name.
/// @param rServiceName  service name to test.
230 sal_Bool SAL_CALL
PlainTextFilterDetect::supportsService(const OUString
& rServiceName
)
232 return cppu::supportsService(this, rServiceName
);
/// Forwards to the free function PlainTextFilterDetect_getSupportedServiceNames().
235 uno::Sequence
<OUString
> SAL_CALL
PlainTextFilterDetect::getSupportedServiceNames()
237 return PlainTextFilterDetect_getSupportedServiceNames();
/// Exported C-linkage entry point that creates a new PlainTextFilterDetect
/// and returns it through cppu::acquire(). Both parameters (the component
/// context and the argument sequence) are unnamed and therefore unused.
240 extern "C" SAL_DLLPUBLIC_EXPORT
css::uno::XInterface
*
241 com_sun_star_comp_filters_PlainTextFilterDetect_get_implementation(css::uno::XComponentContext
* ,
242 css::uno::Sequence
<css::uno::Any
> const &)
244 return cppu::acquire(new PlainTextFilterDetect
);
247 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */