Update git submodules
[LibreOffice.git] / filter / source / textfilterdetect / filterdetect.cxx
blob c74e11b8a3a1d07c10314138873ca9f75a1c7fda
1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8 */
10 #include "filterdetect.hxx"
12 #include <svtools/htmltokn.h>
13 #include <tools/urlobj.hxx>
14 #include <tools/zcodec.hxx>
15 #include <ucbhelper/content.hxx>
16 #include <unotools/mediadescriptor.hxx>
17 #include <unotools/streamwrap.hxx>
18 #include <unotools/ucbstreamhelper.hxx>
20 #include <com/sun/star/io/XInputStream.hpp>
21 #include <cppuhelper/supportsservice.hxx>
22 #include <memory>
24 constexpr OUString WRITER_TEXT_FILTER = u"Text"_ustr;
25 constexpr OUString CALC_TEXT_FILTER = u"Text - txt - csv (StarCalc)"_ustr;
27 constexpr OUStringLiteral WEB_HTML_FILTER = u"HTML";
28 constexpr OUStringLiteral WRITER_HTML_FILTER = u"HTML (StarWriter)";
29 constexpr OUStringLiteral CALC_HTML_FILTER = u"calc_HTML_WebQuery";
31 constexpr OUString WRITER_DOCSERVICE = u"com.sun.star.text.TextDocument"_ustr;
32 constexpr OUString CALC_DOCSERVICE = u"com.sun.star.sheet.SpreadsheetDocument"_ustr;
34 using namespace ::com::sun::star;
35 using utl::MediaDescriptor;
37 namespace {
39 bool IsHTMLStream( const uno::Reference<io::XInputStream>& xInStream )
41 std::unique_ptr<SvStream> pInStream( utl::UcbStreamHelper::CreateStream( xInStream ) );
42 if ( !pInStream || pInStream->GetError() )
43 // No stream
44 return false;
46 // Read the stream header
47 pInStream->StartReadingUnicodeText( RTL_TEXTENCODING_DONTKNOW );
48 const sal_uInt64 nUniPos = pInStream->Tell();
49 const sal_uInt16 nSize = 4096;
51 OString sHeader;
52 if ( nUniPos == 3 || nUniPos == 0 ) // UTF-8 or non-Unicode
53 sHeader = read_uInt8s_ToOString( *pInStream, nSize );
54 else // UTF-16 (nUniPos = 2)
55 sHeader = OUStringToOString( read_uInt16s_ToOUString( *pInStream, nSize ), RTL_TEXTENCODING_ASCII_US );
57 // Now check whether the stream begins with a known HTML tag.
58 enum DetectPhase { BeforeTag, TagOpened, InTagName };
59 DetectPhase dp = BeforeTag;
60 /// BeforeDeclaration -> ? -> DeclarationOpened -> > -> BeforeDeclaration.
61 enum DeclarationPhase
63 BeforeDeclaration,
64 DeclarationOpened
66 DeclarationPhase eDeclaration = BeforeDeclaration;
68 const char* pHeader = sHeader.getStr();
69 const int nLength = sHeader.getLength();
70 int i = 0, nStartOfTagIndex = 0;
72 for ( i = 0; i < nLength; ++i, ++pHeader )
74 char c = *pHeader;
75 if ((c == ' ' || c == '\n' || c == '\t' || c == '\r' || c == '\f')
76 && eDeclaration == BeforeDeclaration)
78 if ( dp == TagOpened )
79 return false; // Invalid: Should start with a tag name
80 else if ( dp == InTagName )
81 break; // End of tag name reached
83 else if ( c == '<' )
85 if ( dp == BeforeTag )
86 dp = TagOpened;
87 else
88 return false; // Invalid: Nested '<'
90 else if ( c == '>' )
92 if ( dp == InTagName )
93 break; // End of tag name reached
94 else if (eDeclaration == DeclarationOpened)
96 dp = BeforeTag;
97 eDeclaration = BeforeDeclaration;
99 else
100 return false; // Invalid: Empty tag or before '<'
102 else if ( c == '!' )
104 if ( dp == TagOpened )
105 return true; // "<!" - DOCTYPE or comments block
106 else
107 return false; // Invalid: '!' before '<' or inside tag name
109 else
111 if ( dp == BeforeTag )
112 return false; // Invalid: Should start with a tag
113 else if ( dp == TagOpened )
115 if (c == '?' && eDeclaration == BeforeDeclaration)
116 eDeclaration = DeclarationOpened;
117 else if (eDeclaration == BeforeDeclaration)
119 nStartOfTagIndex = i;
120 dp = InTagName;
126 // The string following '<' has to be a known HTML token.
127 OString aToken = sHeader.copy( nStartOfTagIndex, i - nStartOfTagIndex );
128 return GetHTMLToken( OStringToOUString( aToken.toAsciiLowerCase(), RTL_TEXTENCODING_ASCII_US ) ) != HtmlTokenId::NONE;
132 PlainTextFilterDetect::PlainTextFilterDetect() {}
134 PlainTextFilterDetect::~PlainTextFilterDetect() {}
136 OUString SAL_CALL PlainTextFilterDetect::detect(uno::Sequence<beans::PropertyValue>& lDescriptor)
138 MediaDescriptor aMediaDesc(lDescriptor);
140 OUString aType = aMediaDesc.getUnpackedValueOrDefault(MediaDescriptor::PROP_TYPENAME, OUString() );
141 OUString aDocService = aMediaDesc.getUnpackedValueOrDefault(MediaDescriptor::PROP_DOCUMENTSERVICE, OUString() );
143 if ((aType == "generic_HTML") || (aType == "calc_HTML"))
145 uno::Reference<io::XInputStream> xInStream(aMediaDesc[MediaDescriptor::PROP_INPUTSTREAM], uno::UNO_QUERY);
146 if (!xInStream.is() || !IsHTMLStream(xInStream))
147 return OUString();
149 if ((aDocService == CALC_DOCSERVICE) || (aType == "calc_HTML"))
150 aMediaDesc[MediaDescriptor::PROP_FILTERNAME] <<= OUString(CALC_HTML_FILTER);
151 else if (aDocService == WRITER_DOCSERVICE)
152 aMediaDesc[MediaDescriptor::PROP_FILTERNAME] <<= OUString(WRITER_HTML_FILTER);
153 else
154 aMediaDesc[MediaDescriptor::PROP_FILTERNAME] <<= OUString(WEB_HTML_FILTER);
157 else if (aType == "generic_Text")
159 uno::Reference<io::XStream> xStream(aMediaDesc[MediaDescriptor::PROP_STREAM], uno::UNO_QUERY);
160 uno::Reference<io::XInputStream> xInStream(aMediaDesc[MediaDescriptor::PROP_INPUTSTREAM], uno::UNO_QUERY);
161 if (xStream.is() || xInStream.is())
163 ZCodec aCodecGZ;
164 std::unique_ptr<SvStream> pInStream;
165 if (xStream.is())
166 pInStream = utl::UcbStreamHelper::CreateStream(xStream);
167 else
168 pInStream = utl::UcbStreamHelper::CreateStream(xInStream);
169 std::unique_ptr<SvMemoryStream> pDecompressedStream(new SvMemoryStream());
170 if (aCodecGZ.AttemptDecompression(*pInStream, *pDecompressedStream))
172 uno::Reference<io::XStream> xStreamDecompressed(new utl::OStreamWrapper(std::move(pDecompressedStream)));
173 aMediaDesc[MediaDescriptor::PROP_STREAM] <<= xStreamDecompressed;
174 aMediaDesc[MediaDescriptor::PROP_INPUTSTREAM] <<= xStreamDecompressed->getInputStream();
175 OUString aURL = aMediaDesc.getUnpackedValueOrDefault(MediaDescriptor::PROP_URL, OUString() );
176 sal_Int32 nIdx = aURL.lastIndexOf(".gz");
177 if (nIdx != -1)
178 aMediaDesc[MediaDescriptor::PROP_URL] <<= aURL.copy(0, nIdx);
181 // Get the file name extension.
182 INetURLObject aParser(aMediaDesc.getUnpackedValueOrDefault(MediaDescriptor::PROP_URL, OUString() ) );
183 OUString aExt = aParser.getExtension(INetURLObject::LAST_SEGMENT, true, INetURLObject::DecodeMechanism::WithCharset);
184 aExt = aExt.toAsciiLowerCase();
185 OUString aName = aParser.getName().toAsciiLowerCase();
187 // Decide which filter to use based on the document service first,
188 // then on extension if that's not available.
190 if (aDocService == CALC_DOCSERVICE)
191 aMediaDesc[MediaDescriptor::PROP_FILTERNAME] <<= CALC_TEXT_FILTER;
192 else if (aDocService == WRITER_DOCSERVICE)
193 aMediaDesc[MediaDescriptor::PROP_FILTERNAME] <<= WRITER_TEXT_FILTER;
194 else if (aExt == "csv" || aExt == "tsv" || aExt == "tab" || aExt == "xls" || aName.endsWith(".csv.gz"))
195 aMediaDesc[MediaDescriptor::PROP_FILTERNAME] <<= CALC_TEXT_FILTER;
196 else
197 aMediaDesc[MediaDescriptor::PROP_FILTERNAME] <<= WRITER_TEXT_FILTER;
200 else
201 // Nothing to detect.
202 return OUString();
204 aMediaDesc >> lDescriptor;
205 return aType;
208 // XInitialization
210 void SAL_CALL PlainTextFilterDetect::initialize(const uno::Sequence<uno::Any>& /*aArguments*/)
214 OUString PlainTextFilterDetect_getImplementationName()
216 return u"com.sun.star.comp.filters.PlainTextFilterDetect"_ustr;
219 uno::Sequence<OUString> PlainTextFilterDetect_getSupportedServiceNames()
221 return { u"com.sun.star.document.ExtendedTypeDetection"_ustr, u"com.sun.star.comp.filters.PlainTextFilterDetect"_ustr };
224 // XServiceInfo
225 OUString SAL_CALL PlainTextFilterDetect::getImplementationName()
227 return PlainTextFilterDetect_getImplementationName();
230 sal_Bool SAL_CALL PlainTextFilterDetect::supportsService(const OUString& rServiceName)
232 return cppu::supportsService(this, rServiceName);
235 uno::Sequence<OUString> SAL_CALL PlainTextFilterDetect::getSupportedServiceNames()
237 return PlainTextFilterDetect_getSupportedServiceNames();
240 extern "C" SAL_DLLPUBLIC_EXPORT css::uno::XInterface*
241 com_sun_star_comp_filters_PlainTextFilterDetect_get_implementation(css::uno::XComponentContext* ,
242 css::uno::Sequence<css::uno::Any> const &)
244 return cppu::acquire(new PlainTextFilterDetect);
247 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */