1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 * This file incorporates work covered by the following license notice:
11 * Licensed to the Apache Software Foundation (ASF) under one or more
12 * contributor license agreements. See the NOTICE file distributed
13 * with this work for additional information regarding copyright
14 * ownership. The ASF licenses this file to you under the Apache
15 * License, Version 2.0 (the "License"); you may not use this file
16 * except in compliance with the License. You may obtain a copy of
17 * the License at http://www.apache.org/licenses/LICENSE-2.0 .
20 #include "HtmlFmtFlt.hxx"
22 #include <rtl/string.h>
23 #include <osl/diagnose.h>
31 using namespace com::sun::star::uno
;
33 // converts the openoffice text/html clipboard format to the HTML Format
34 // well known under MS Windows
35 // the MS HTML Format has a header before the real html data
// Version:1.0 Version number of the clipboard. Starting version is 0.9
38 // StartHTML: Byte count from the beginning of the clipboard to the start
39 // of the context, or -1 if no context
40 // EndHTML: Byte count from the beginning of the clipboard to the end
41 // of the context, or -1 if no context
42 // StartFragment: Byte count from the beginning of the clipboard to the
43 // start of the fragment
44 // EndFragment: Byte count from the beginning of the clipboard to the
45 // end of the fragment
46 // StartSelection: Byte count from the beginning of the clipboard to the
47 // start of the selection
48 // EndSelection: Byte count from the beginning of the clipboard to the
49 // end of the selection
51 // StartSelection and EndSelection are optional
// The fragment should be preceded and followed by the HTML comments
// <!--StartFragment--> and <!--EndFragment--> (no space between !-- and the
// text) to mark where the fragment starts and ends
// Builds the textual header that precedes the HTML data in the MS Windows
// "HTML Format" clipboard format.
//
// The header consists of the line "Version:1.0" followed by one line each
// for StartHTML, EndHTML, StartFragment and EndFragment. Every offset is
// written as a decimal number that is zero-padded to at least ten digits
// (larger values are written in full, never truncated) and every line is
// terminated by "\r\n". As long as all offsets fit into ten digits the
// header therefore always has the same length.
//
// startHtml      value written to the StartHTML field
// endHtml        value written to the EndHTML field
// startFragment  value written to the StartFragment field
// endFragment    value written to the EndFragment field
//
// Returns the complete header.
std::string GetHtmlFormatHeader(size_t startHtml, size_t endHtml, size_t startFragment, size_t endFragment)
{
    // Appends one "<name><zero padded offset>\r\n" line. The number is
    // converted with std::to_string rather than with a stream so that a
    // global C++ locale with digit grouping cannot put separators into the
    // field and thereby change the length of the header.
    auto appendField = [](std::string& rHeader, const char* pName, size_t nOffset)
    {
        const std::string::size_type nMinDigits = 10;
        const std::string aDigits = std::to_string(nOffset);

        rHeader += pName;
        if (aDigits.length() < nMinDigits)
            rHeader.append(nMinDigits - aDigits.length(), '0');
        rHeader += aDigits;
        rHeader += "\r\n";
    };

    std::string htmlHeader("Version:1.0\r\n");
    appendField(htmlHeader, "StartHTML:", startHtml);
    appendField(htmlHeader, "EndHTML:", endHtml);
    appendField(htmlHeader, "StartFragment:", startFragment);
    appendField(htmlHeader, "EndFragment:", endFragment);
    return htmlHeader;
}
// Markers that are looked up in the text/html data. They are matched with a
// plain, case-sensitive std::string::find, so only the lowercase spelling
// given here is recognised.

// The opening and closing html tags are matched as complete tags, i.e.
// without attributes and without embedded white space
const std::string TAG_HTML("<html>");
const std::string TAG_END_HTML("</html>");

// The body tags are stored without their closing '>': the tag may carry
// attributes (e.g. <body param>, #92840#), so the end of the tag has to be
// located separately
const std::string TAG_BODY("<body");
const std::string TAG_END_BODY("</body");
81 Sequence
<sal_Int8
> SAL_CALL
TextHtmlToHTMLFormat(Sequence
<sal_Int8
>& aTextHtml
)
83 OSL_ASSERT(aTextHtml
.getLength() > 0);
85 if (!(aTextHtml
.getLength() > 0))
86 return Sequence
<sal_Int8
>();
88 // fill the buffer with dummy values to calc the exact length
89 std::string dummyHtmlHeader
= GetHtmlFormatHeader(0, 0, 0, 0);
90 size_t lHtmlFormatHeader
= dummyHtmlHeader
.length();
93 reinterpret_cast<const sal_Char
*>(aTextHtml
.getConstArray()),
94 reinterpret_cast<const sal_Char
*>(aTextHtml
.getConstArray()) + aTextHtml
.getLength());
96 std::string::size_type nStartHtml
= textHtml
.find(TAG_HTML
) + lHtmlFormatHeader
- 1; // we start one before '<HTML>' Word 2000 does also so
97 std::string::size_type nEndHtml
= textHtml
.find(TAG_END_HTML
) + lHtmlFormatHeader
+ TAG_END_HTML
.length() + 1; // our SOffice 5.2 wants 2 behind </HTML>?
99 // The body tag may have parameters so we need to search for the
100 // closing '>' manually e.g. <BODY param> #92840#
101 std::string::size_type nStartFragment
= textHtml
.find(">", textHtml
.find(TAG_BODY
)) + lHtmlFormatHeader
+ 1;
102 std::string::size_type nEndFragment
= textHtml
.find(TAG_END_BODY
) + lHtmlFormatHeader
;
104 std::string htmlFormat
= GetHtmlFormatHeader(nStartHtml
, nEndHtml
, nStartFragment
, nEndFragment
);
105 htmlFormat
+= textHtml
;
107 Sequence
<sal_Int8
> byteSequence(htmlFormat
.length() + 1); // space the trailing '\0'
108 memset(byteSequence
.getArray(), 0, byteSequence
.getLength());
111 static_cast<void*>(byteSequence
.getArray()),
112 static_cast<const void*>(htmlFormat
.c_str()),
113 htmlFormat
.length());
118 const char* const HtmlStartTag
= "<html";
120 Sequence
<sal_Int8
> HTMLFormatToTextHtml(const Sequence
<sal_Int8
>& aHTMLFormat
)
122 assert(isHTMLFormat(aHTMLFormat
) && "No HTML Format provided");
124 Sequence
<sal_Int8
>& nonconstHTMLFormatRef
= const_cast< Sequence
<sal_Int8
>& >(aHTMLFormat
);
125 sal_Char
* dataStart
= reinterpret_cast<sal_Char
*>(nonconstHTMLFormatRef
.getArray());
126 sal_Char
* dataEnd
= dataStart
+ nonconstHTMLFormatRef
.getLength() - 1;
127 const sal_Char
* htmlStartTag
= strcasestr(dataStart
, HtmlStartTag
);
129 assert(htmlStartTag
&& "Seems to be no HTML at all");
131 // It doesn't seem to be HTML? Well then simply return what has been
132 // provided in non-debug builds
133 if (htmlStartTag
== nullptr)
138 sal_Int32 len
= dataEnd
- htmlStartTag
;
139 Sequence
<sal_Int8
> plainHtmlData(len
);
141 memcpy(static_cast<void*>(plainHtmlData
.getArray()), htmlStartTag
, len
);
143 return plainHtmlData
;
146 /* A simple format detection. We are just comparing the first few bytes
147 of the provided byte sequence to see whether or not it is the MS
148 Office Html format. If it shows that this is not reliable enough we
151 const char HtmlFormatStart
[] = "Version:";
152 int const HtmlFormatStartLen
= (sizeof(HtmlFormatStart
) - 1);
154 bool isHTMLFormat(const Sequence
<sal_Int8
>& aHtmlSequence
)
156 if (aHtmlSequence
.getLength() < HtmlFormatStartLen
)
159 return rtl_str_compareIgnoreAsciiCase_WithLength(HtmlFormatStart
,
161 reinterpret_cast<const sal_Char
*>(aHtmlSequence
.getConstArray()),
162 HtmlFormatStartLen
) == 0;
165 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */