third_party/WebKit/Source/wtf/text/TextEncoding.cpp

   1 /*
   2  * Copyright (C) 2004, 2006, 2007, 2008, 2009 Apple Inc. All rights reserved.
   3  * Copyright (C) 2006 Alexey Proskuryakov <ap@nypop.com>
   4  * Copyright (C) 2007-2009 Torch Mobile, Inc.
   5  *
   6  * Redistribution and use in source and binary forms, with or without
   7  * modification, are permitted provided that the following conditions
   8  * are met:
   9  * 1. Redistributions of source code must retain the above copyright
  10  *    notice, this list of conditions and the following disclaimer.
  11  * 2. Redistributions in binary form must reproduce the above copyright
  12  *    notice, this list of conditions and the following disclaimer in the
  13  *    documentation and/or other materials provided with the distribution.
  14  *
  15  * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
  16  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  18  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE COMPUTER, INC. OR
  19  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  20  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  21  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  22  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
  23  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  25  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26  */
  27
  28 #include "config.h"
  29 #include "wtf/text/TextEncoding.h"
  30
  31 #include "wtf/text/TextEncodingRegistry.h"
  32 #include <unicode/unorm.h>
  33 #include "wtf/OwnPtr.h"
  34 #include "wtf/StdLibExtras.h"
  35 #include "wtf/Threading.h"
  36 #include "wtf/text/CString.h"
  37 #include "wtf/text/WTFString.h"
  38
  39 namespace WTF {
  40
  41 static const TextEncoding& UTF7Encoding()
  42 {
  43     AtomicallyInitializedStaticReference(const TextEncoding, globalUTF7Encoding, new TextEncoding("UTF-7"));
  44     return globalUTF7Encoding;
  45 }
  46
  47 TextEncoding::TextEncoding(const char* name)
  48     : m_name(atomicCanonicalTextEncodingName(name))
  49 {
  50     // Aliases are valid, but not "replacement" itself.
  51     if (m_name && isReplacementEncoding(name))
  52         m_name = 0;
  53 }
  54
  55 TextEncoding::TextEncoding(const String& name)
  56     : m_name(atomicCanonicalTextEncodingName(name))
  57 {
  58     // Aliases are valid, but not "replacement" itself.
  59     if (m_name && isReplacementEncoding(name))
  60         m_name = 0;
  61 }
  62
  63 String TextEncoding::decode(const char* data, size_t length, bool stopOnError, bool& sawError) const
  64 {
  65     if (!m_name)
  66         return String();
  67
  68     return newTextCodec(*this)->decode(data, length, DataEOF, stopOnError, sawError);
  69 }
  70
  71 CString TextEncoding::encode(const String& string, UnencodableHandling handling) const
  72 {
  73     if (!m_name)
  74         return CString();
  75
  76     if (string.isEmpty())
  77         return "";
  78
  79     OwnPtr<TextCodec> textCodec = newTextCodec(*this);
  80     CString encodedString;
  81     if (string.is8Bit())
  82         encodedString = textCodec->encode(string.characters8(), string.length(), handling);
  83     else
  84         encodedString = textCodec->encode(string.characters16(), string.length(), handling);
  85     return encodedString;
  86 }
  87
  88 CString TextEncoding::normalizeAndEncode(const String& string, UnencodableHandling handling) const
  89 {
  90     if (!m_name)
  91         return CString();
  92
  93     if (string.isEmpty())
  94         return "";
  95
  96     // Text exclusively containing Latin-1 characters (U+0000..U+00FF) is left
  97     // unaffected by NFC. This is effectively the same as saying that all
  98     // Latin-1 text is already normalized to NFC.
  99     // Source: http://unicode.org/reports/tr15/
 100     if (string.is8Bit())
 101         return newTextCodec(*this)->encode(string.characters8(), string.length(), handling);
 102
 103     const UChar* source = string.characters16();
 104     size_t length = string.length();
 105
 106     Vector<UChar> normalizedCharacters;
 107
 108     UErrorCode err = U_ZERO_ERROR;
 109     if (unorm_quickCheck(source, length, UNORM_NFC, &err) != UNORM_YES) {
 110         // First try using the length of the original string, since normalization to NFC rarely increases length.
 111         normalizedCharacters.grow(length);
 112         int32_t normalizedLength = unorm_normalize(source, length, UNORM_NFC, 0, normalizedCharacters.data(), length, &err);
 113         if (err == U_BUFFER_OVERFLOW_ERROR) {
 114             err = U_ZERO_ERROR;
 115             normalizedCharacters.resize(normalizedLength);
 116             normalizedLength = unorm_normalize(source, length, UNORM_NFC, 0, normalizedCharacters.data(), normalizedLength, &err);
 117         }
 118         ASSERT(U_SUCCESS(err));
 119
 120         source = normalizedCharacters.data();
 121         length = normalizedLength;
 122     }
 123
 124     return newTextCodec(*this)->encode(source, length, handling);
 125 }
 126
 127 bool TextEncoding::usesVisualOrdering() const
 128 {
 129     if (noExtendedTextEncodingNameUsed())
 130         return false;
 131
 132     static const char* const a = atomicCanonicalTextEncodingName("ISO-8859-8");
 133     return m_name == a;
 134 }
 135
 136 bool TextEncoding::isNonByteBasedEncoding() const
 137 {
 138     if (noExtendedTextEncodingNameUsed()) {
 139         return *this == UTF16LittleEndianEncoding()
 140             || *this == UTF16BigEndianEncoding();
 141     }
 142
 143     return *this == UTF16LittleEndianEncoding()
 144         || *this == UTF16BigEndianEncoding()
 145         || *this == UTF32BigEndianEncoding()
 146         || *this == UTF32LittleEndianEncoding();
 147 }
 148
 149 bool TextEncoding::isUTF7Encoding() const
 150 {
 151     if (noExtendedTextEncodingNameUsed())
 152         return false;
 153
 154     return *this == UTF7Encoding();
 155 }
 156
 157 const TextEncoding& TextEncoding::closestByteBasedEquivalent() const
 158 {
 159     if (isNonByteBasedEncoding())
 160         return UTF8Encoding();
 161     return *this;
 162 }
 163
// HTML5 specifies that UTF-8 be used in form submission when a form is
// a part of a document in UTF-16 probably because UTF-16 is not a
// byte-based encoding and can contain 0x00. By extension, the same
// should be done for UTF-32. In case of UTF-7, it is a byte-based encoding,
// but it's fraught with problems and we'd rather steer clear of it.
 169 const TextEncoding& TextEncoding::encodingForFormSubmission() const
 170 {
 171     if (isNonByteBasedEncoding() || isUTF7Encoding())
 172         return UTF8Encoding();
 173     return *this;
 174 }
 175
 176 const TextEncoding& ASCIIEncoding()
 177 {
 178     AtomicallyInitializedStaticReference(const TextEncoding, globalASCIIEncoding, new TextEncoding("ASCII"));
 179     return globalASCIIEncoding;
 180 }
 181
 182 const TextEncoding& Latin1Encoding()
 183 {
 184     AtomicallyInitializedStaticReference(const TextEncoding, globalLatin1Encoding, new TextEncoding("latin1"));
 185     return globalLatin1Encoding;
 186 }
 187
 188 const TextEncoding& UTF16BigEndianEncoding()
 189 {
 190     AtomicallyInitializedStaticReference(const TextEncoding, globalUTF16BigEndianEncoding, new TextEncoding("UTF-16BE"));
 191     return globalUTF16BigEndianEncoding;
 192 }
 193
 194 const TextEncoding& UTF16LittleEndianEncoding()
 195 {
 196     AtomicallyInitializedStaticReference(const TextEncoding, globalUTF16LittleEndianEncoding, new TextEncoding("UTF-16LE"));
 197     return globalUTF16LittleEndianEncoding;
 198 }
 199
 200 const TextEncoding& UTF32BigEndianEncoding()
 201 {
 202     AtomicallyInitializedStaticReference(const TextEncoding, globalUTF32BigEndianEncoding, new TextEncoding("UTF-32BE"));
 203     return globalUTF32BigEndianEncoding;
 204 }
 205
 206 const TextEncoding& UTF32LittleEndianEncoding()
 207 {
 208     AtomicallyInitializedStaticReference(const TextEncoding, globalUTF32LittleEndianEncoding, new TextEncoding("UTF-32LE"));
 209     return globalUTF32LittleEndianEncoding;
 210 }
 211
 212 const TextEncoding& UTF8Encoding()
 213 {
 214     AtomicallyInitializedStaticReference(const TextEncoding, globalUTF8Encoding, new TextEncoding("UTF-8"));
 215     ASSERT(globalUTF8Encoding.isValid());
 216     return globalUTF8Encoding;
 217 }
 218
 219 const TextEncoding& WindowsLatin1Encoding()
 220 {
 221     AtomicallyInitializedStaticReference(const TextEncoding, globalWindowsLatin1Encoding, new TextEncoding("WinLatin1"));
 222     return globalWindowsLatin1Encoding;
 223 }
 224
 225 } // namespace WTF