vcl/aqua/source/dtrans/HtmlFmtFlt.cxx

   1 #include "HtmlFmtFlt.hxx"
   2
   3 #include <rtl/string.h>
   4
   5 #include <string>
   6 #include <sstream>
   7 #include <vector>
   8 #include <iomanip>
   9
  10 #include <boost/assert.hpp>
  11
  12 using namespace com::sun::star::uno;
  13
  14 //------------------------------------------------------------------------------
  15 // converts the openoffice text/html clipboard format to the HTML Format
  16 // well known under MS Windows
  17 // the MS HTML Format has a header before the real html data
  18 //
  19 // Version:1.0          Version number of the clipboard. Starting version is 0.9
  20 // StartHTML:           Byte count from the beginning of the clipboard to the start
  21 //                                      of the context, or -1 if no context
  22 // EndHTML:                     Byte count from the beginning of the clipboard to the end
  23 //                                      of the context, or -1 if no context
  24 // StartFragment:       Byte count from the beginning of the clipboard to the
  25 //                                      start of the fragment
  26 // EndFragment:         Byte count from the beginning of the clipboard to the
  27 //                                      end of the fragment
  28 // StartSelection:      Byte count from the beginning of the clipboard to the
  29 //                                      start of the selection
  30 // EndSelection:        Byte count from the beginning of the clipboard to the
  31 //                                      end of the selection
  32 //
  33 // StartSelection and EndSelection are optional
  34 // The fragment should be preceded and followed by the HTML comments
  35 // <!--StartFragment--> and <!--EndFragment--> (no space between !-- and the
  36 // text)
  37 //------------------------------------------------------------------------------
  38
  39 namespace // private
  40 {
  41 std::string GetHtmlFormatHeader(size_t startHtml, size_t endHtml, size_t startFragment, size_t endFragment)
  42 {
  43     std::ostringstream htmlHeader;
  44     htmlHeader << "Version:1.0" << '\r' << '\n';
  45     htmlHeader << "StartHTML:" << std::setw(10) << std::setfill('0') << std::dec << startHtml << '\r' << '\n';
  46     htmlHeader << "EndHTML:" << std::setw(10) << std::setfill('0') << std::dec << endHtml << '\r' << '\n';
  47     htmlHeader << "StartFragment:" << std::setw(10) << std::setfill('0') << std::dec << startFragment << '\r' << '\n';
  48     htmlHeader << "EndFragment:" << std::setw(10) << std::setfill('0') << std::dec << endFragment << '\r' << '\n';
  49     return htmlHeader.str();
  50 }
  51
  52 } // namespace private
  53
  54
  55 // the office allways writes the start and end html tag in upper cases and
  56 // without spaces both tags don't allow parameters
  57 const std::string TAG_HTML = std::string("<HTML>");
  58 const std::string TAG_END_HTML = std::string("</HTML>");
  59
  60 // The body tag may have parameters so we need to search for the
  61 // closing '>' manually e.g. <BODY param> #92840#
  62 const std::string TAG_BODY = std::string("<BODY");
  63 const std::string TAG_END_BODY = std::string("</BODY");
  64
  65 Sequence<sal_Int8> SAL_CALL TextHtmlToHTMLFormat(Sequence<sal_Int8>& aTextHtml)
  66 {
  67     OSL_ASSERT(aTextHtml.getLength() > 0);
  68
  69     if (!(aTextHtml.getLength() > 0))
  70         return Sequence<sal_Int8>();
  71
  72     // fill the buffer with dummy values to calc the exact length
  73     std::string dummyHtmlHeader = GetHtmlFormatHeader(0, 0, 0, 0);
  74     size_t lHtmlFormatHeader = dummyHtmlHeader.length();
  75
  76     std::string textHtml(
  77         reinterpret_cast<const sal_Char*>(aTextHtml.getConstArray()),
  78         reinterpret_cast<const sal_Char*>(aTextHtml.getConstArray()) + aTextHtml.getLength());
  79
  80     std::string::size_type nStartHtml = textHtml.find(TAG_HTML) + lHtmlFormatHeader - 1; // we start one before '<HTML>' Word 2000 does also so
  81     std::string::size_type nEndHtml = textHtml.find(TAG_END_HTML) + lHtmlFormatHeader + TAG_END_HTML.length() + 1; // our SOffice 5.2 wants 2 behind </HTML>?
  82
  83     // The body tag may have parameters so we need to search for the
  84     // closing '>' manually e.g. <BODY param> #92840#
  85     std::string::size_type nStartFragment = textHtml.find(">", textHtml.find(TAG_BODY)) + lHtmlFormatHeader + 1;
  86     std::string::size_type nEndFragment = textHtml.find(TAG_END_BODY) + lHtmlFormatHeader;
  87
  88     std::string htmlFormat = GetHtmlFormatHeader(nStartHtml, nEndHtml, nStartFragment, nEndFragment);
  89     htmlFormat += textHtml;
  90
  91     Sequence<sal_Int8> byteSequence(htmlFormat.length() + 1); // space the trailing '\0'
  92     rtl_zeroMemory(byteSequence.getArray(), byteSequence.getLength());
  93
  94     rtl_copyMemory(
  95         static_cast<void*>(byteSequence.getArray()),
  96         static_cast<const void*>(htmlFormat.c_str()),
  97         htmlFormat.length());
  98
  99     return byteSequence;
 100 }
 101
 102 const char* HtmlStartTag = "<html";
 103
 104 Sequence<sal_Int8> HTMLFormatToTextHtml(const Sequence<sal_Int8>& aHTMLFormat)
 105 {
 106   BOOST_ASSERT(isHTMLFormat(aHTMLFormat) && "No HTML Format provided");
 107
 108   Sequence<sal_Int8>& nonconstHTMLFormatRef = const_cast< Sequence<sal_Int8>& >(aHTMLFormat);
 109   sal_Char* dataStart = reinterpret_cast<sal_Char*>(nonconstHTMLFormatRef.getArray());
 110   sal_Char* dataEnd = dataStart + nonconstHTMLFormatRef.getLength() - 1;
 111   const sal_Char* htmlStartTag = strcasestr(dataStart, HtmlStartTag);
 112
 113   BOOST_ASSERT(htmlStartTag && "Seems to be no HTML at all");
 114
 115   // It doesn't seem to be HTML? Well then simply return what has been
 116   // provided in non-debug builds
 117   if (htmlStartTag == NULL)
 118     {
 119     return aHTMLFormat;
 120     }
 121
 122   sal_Int32 len = dataEnd - htmlStartTag;
 123   Sequence<sal_Int8> plainHtmlData(len);
 124
 125   rtl_copyMemory(static_cast<void*>(plainHtmlData.getArray()), htmlStartTag, len);
 126
 127   return plainHtmlData;
 128 }
 129
 130 /* A simple format detection. We are just comparing the first few bytes
 131    of the provided byte sequence to see whether or not it is the MS
 132    Office Html format. If it shows that this is not reliable enough we
 133    can improve this
 134 */
 135 const char HtmlFormatStart[] = "Version:";
 136 int HtmlFormatStartLen = (sizeof(HtmlFormatStart) - 1);
 137
 138 bool isHTMLFormat(const Sequence<sal_Int8>& aHtmlSequence)
 139 {
 140   if (aHtmlSequence.getLength() < HtmlFormatStartLen)
 141     return false;
 142
 143   return rtl_str_compareIgnoreAsciiCase_WithLength(HtmlFormatStart,
 144                                                    HtmlFormatStartLen,
 145                                                    reinterpret_cast<const sal_Char*>(aHtmlSequence.getConstArray()),
 146                                                    HtmlFormatStartLen) == 0;
 147 }