/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
/*
 * This file is part of the LibreOffice project.
 *
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 */
10 #include "htmldataprovider.hxx"
11 #include <datamapper.hxx>
12 #include <datatransformation.hxx>
13 #include <salhelper/thread.hxx>
15 #include <vcl/svapp.hxx>
16 #include <tools/stream.hxx>
18 #include <libxml/HTMLparser.h>
20 #include <libxml/xpath.h>
22 #include <comphelper/string.hxx>
26 class HTMLFetchThread
: public salhelper::Thread
28 ScDocument
& mrDocument
;
31 const std::vector
<std::shared_ptr
<sc::DataTransformation
>> maDataTransformations
;
32 std::function
<void()> maImportFinishedHdl
;
34 void handleTable(xmlNodePtr pTable
);
35 void handleRow(xmlNodePtr pRow
, SCROW nRow
);
36 void skipHeadBody(xmlNodePtr pSkip
, SCROW
& rRow
);
37 void handleCell(xmlNodePtr pCell
, SCROW nRow
, SCCOL nCol
);
40 HTMLFetchThread(ScDocument
& rDoc
, const OUString
&, const OUString
& rID
, std::function
<void()> aImportFinishedHdl
,
41 std::vector
<std::shared_ptr
<sc::DataTransformation
>>&& rTransformations
);
43 virtual void execute() override
;
46 HTMLFetchThread::HTMLFetchThread(
47 ScDocument
& rDoc
, const OUString
& rURL
, const OUString
& rID
,
48 std::function
<void()> aImportFinishedHdl
,
49 std::vector
<std::shared_ptr
<sc::DataTransformation
>>&& rTransformations
)
50 : salhelper::Thread("HTML Fetch Thread")
54 , maDataTransformations(std::move(rTransformations
))
55 , maImportFinishedHdl(std::move(aImportFinishedHdl
))
61 OString
toString(const xmlChar
* pStr
)
63 return OString(reinterpret_cast<const char*>(pStr
), xmlStrlen(pStr
));
66 OUString
trim_string(const OUString
& aStr
)
69 OUString aString
= aStr
;
73 aString
= comphelper::string::strip(aString
, ' ');
74 aString
= comphelper::string::strip(aString
, '\n');
75 aString
= comphelper::string::strip(aString
, '\r');
76 aString
= comphelper::string::strip(aString
, '\t');
78 while (aOldString
!= aString
);
83 OUString
get_node_str(xmlNodePtr pNode
)
86 for (xmlNodePtr cur_node
= pNode
->children
; cur_node
; cur_node
= cur_node
->next
)
88 if (cur_node
->type
== XML_TEXT_NODE
)
90 OUString aString
= OStringToOUString(toString(cur_node
->content
), RTL_TEXTENCODING_UTF8
);
91 aStr
.append(trim_string(aString
));
93 else if (cur_node
->type
== XML_ELEMENT_NODE
)
95 aStr
.append(get_node_str(cur_node
));
99 return aStr
.makeStringAndClear();
104 void HTMLFetchThread::handleCell(xmlNodePtr pCellNode
, SCROW nRow
, SCCOL nCol
)
107 for (xmlNodePtr cur_node
= pCellNode
->children
; cur_node
; cur_node
= cur_node
->next
)
109 if (cur_node
->type
== XML_TEXT_NODE
)
111 OUString aString
= OStringToOUString(toString(cur_node
->content
), RTL_TEXTENCODING_UTF8
);
112 aStr
.append(trim_string(aString
));
114 else if (cur_node
->type
== XML_ELEMENT_NODE
)
116 aStr
.append(get_node_str(cur_node
));
122 OUString aCellStr
= aStr
.makeStringAndClear();
123 mrDocument
.SetString(nCol
, nRow
, 0, aCellStr
);
127 void HTMLFetchThread::handleRow(xmlNodePtr pRowNode
, SCROW nRow
)
130 for (xmlNodePtr cur_node
= pRowNode
->children
; cur_node
; cur_node
= cur_node
->next
)
132 if (cur_node
->type
== XML_ELEMENT_NODE
)
134 OString aNodeName
= toString(cur_node
->name
);
135 if (aNodeName
== "td" || aNodeName
== "th")
137 handleCell(cur_node
, nRow
, nCol
);
144 void HTMLFetchThread::skipHeadBody(xmlNodePtr pSkipElement
, SCROW
& rRow
)
146 for (xmlNodePtr cur_node
= pSkipElement
->children
; cur_node
; cur_node
= cur_node
->next
)
148 if (cur_node
->type
== XML_ELEMENT_NODE
)
150 OString aNodeName
= toString(cur_node
->name
);
151 if (aNodeName
== "tr")
153 handleRow(cur_node
, rRow
);
161 void HTMLFetchThread::handleTable(xmlNodePtr pTable
)
164 for (xmlNodePtr cur_node
= pTable
->children
; cur_node
; cur_node
= cur_node
->next
)
166 if (cur_node
->type
== XML_ELEMENT_NODE
)
168 OString aNodeName
= toString(cur_node
->name
);
169 if (aNodeName
== "tr")
171 handleRow(cur_node
, nRow
);
174 else if (aNodeName
== "thead" || aNodeName
== "tbody")
176 skipHeadBody(cur_node
, nRow
);
182 void HTMLFetchThread::execute()
184 OStringBuffer
aBuffer(64000);
185 DataProvider::FetchStreamFromURL(maURL
, aBuffer
);
187 if (aBuffer
.isEmpty())
190 htmlDocPtr pHtmlPtr
= htmlParseDoc(reinterpret_cast<xmlChar
*>(const_cast<char*>(aBuffer
.getStr())), nullptr);
192 OString aID
= OUStringToOString(maID
, RTL_TEXTENCODING_UTF8
);
193 xmlXPathContextPtr pXmlXpathCtx
= xmlXPathNewContext(pHtmlPtr
);
194 xmlXPathObjectPtr pXmlXpathObj
= xmlXPathEvalExpression(BAD_CAST(aID
.getStr()), pXmlXpathCtx
);
198 xmlXPathFreeContext(pXmlXpathCtx
);
201 xmlNodeSetPtr pXmlNodes
= pXmlXpathObj
->nodesetval
;
205 xmlXPathFreeNodeSetList(pXmlXpathObj
);
206 xmlXPathFreeContext(pXmlXpathCtx
);
210 if (pXmlNodes
->nodeNr
== 0)
212 xmlXPathFreeNodeSet(pXmlNodes
);
213 xmlXPathFreeNodeSetList(pXmlXpathObj
);
214 xmlXPathFreeContext(pXmlXpathCtx
);
218 xmlNodePtr pNode
= pXmlNodes
->nodeTab
[0];
221 xmlXPathFreeNodeSet(pXmlNodes
);
222 xmlXPathFreeNodeSetList(pXmlXpathObj
);
223 xmlXPathFreeContext(pXmlXpathCtx
);
225 for (auto& itr
: maDataTransformations
)
227 itr
->Transform(mrDocument
);
230 SolarMutexGuard aGuard
;
231 maImportFinishedHdl();
234 HTMLDataProvider::HTMLDataProvider(ScDocument
* pDoc
, sc::ExternalDataSource
& rDataSource
):
235 DataProvider(rDataSource
),
240 HTMLDataProvider::~HTMLDataProvider()
242 if (mxHTMLFetchThread
.is())
244 SolarMutexReleaser aReleaser
;
245 mxHTMLFetchThread
->join();
249 void HTMLDataProvider::Import()
251 // already importing data
255 mpDoc
.reset(new ScDocument(SCDOCMODE_CLIP
));
256 mpDoc
->ResetClip(mpDocument
, SCTAB(0));
257 mxHTMLFetchThread
= new HTMLFetchThread(*mpDoc
, mrDataSource
.getURL(), mrDataSource
.getID(),
258 std::bind(&HTMLDataProvider::ImportFinished
, this), std::vector(mrDataSource
.getDataTransformation()));
259 mxHTMLFetchThread
->launch();
263 SolarMutexReleaser aReleaser
;
264 mxHTMLFetchThread
->join();
268 void HTMLDataProvider::ImportFinished()
270 mrDataSource
.getDBManager()->WriteToDoc(*mpDoc
);
273 const OUString
& HTMLDataProvider::GetURL() const
275 return mrDataSource
.getURL();
/* vim:set shiftwidth=4 softtabstop=4 expandtab: */