2 * Copyright (C) 2006, 2007, 2011 Apple Inc. All rights reserved.
3 * Copyright (C) 2007-2009 Torch Mobile, Inc.
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
14 * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
15 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR
18 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 #include "wtf/text/TextEncodingRegistry.h"
30 #include "wtf/ASCIICType.h"
31 #include "wtf/CurrentTime.h"
32 #include "wtf/HashMap.h"
33 #include "wtf/HashSet.h"
34 #include "wtf/MainThread.h"
35 #include "wtf/StdLibExtras.h"
36 #include "wtf/StringExtras.h"
37 #include "wtf/ThreadingPrimitives.h"
38 #include "wtf/text/CString.h"
39 #include "wtf/text/TextCodecICU.h"
40 #include "wtf/text/TextCodecLatin1.h"
41 #include "wtf/text/TextCodecReplacement.h"
42 #include "wtf/text/TextCodecUTF16.h"
43 #include "wtf/text/TextCodecUTF8.h"
44 #include "wtf/text/TextCodecUserDefined.h"
45 #include "wtf/text/TextEncoding.h"
// Longest encoding name or alias the registry accepts, not counting the
// terminating NUL. addToTextEncodingNameMap() asserts against it and the
// character-array lookup sizes its stack buffer as this value plus one.
const size_t maxEncodingNameLength = 63;
51 // Hash for all-ASCII strings that does case folding.
// Supplies the equal() / hash() / safeToCompareToEmptyOrDeleted members and is
// used as the hash-functions argument of the TextEncodingNameMap typedef.
// NOTE(review): several statements of this struct (local declarations, loop
// headers, returns, closing braces) are not visible in this copy of the file;
// the comments below describe only what can be seen.
52 struct TextEncodingNameHash
{
// Compares two C strings, folding characters through toASCIILower().
53 static bool equal(const char* s1
, const char* s2
)
58 #if defined(_MSC_FULL_VER) && _MSC_FULL_VER == 170051106
59 // Workaround for a bug in the VS2012 Update 1 optimizer, remove once the fix is released.
60 // https://connect.microsoft.com/VisualStudio/feedback/details/777533/vs2012-c-optimizing-bug-when-using-inline-and-char-return-type-x86-target-only
// Workaround path: each character is lower-cased as it is read, and both
// pointers are advanced by the post-increment.
61 c1
= toASCIILower(*s1
++);
62 c2
= toASCIILower(*s2
++);
// Here the characters are lower-cased at comparison time instead.
// NOTE(review): presumably this is the non-workaround (#else) branch; the
// directive and the reads into c1/c2 are on lines not shown - confirm.
68 if (toASCIILower(c1
) != toASCIILower(c2
))
75 // This algorithm is the one-at-a-time hash from:
76 // http://burtleburtle.net/bob/hash/hashfaq.html
77 // http://burtleburtle.net/bob/hash/doobs.html
78 static unsigned hash(const char* s
)
// The accumulator is seeded with WTF::stringHashingStartValue.
// NOTE(review): the mixing loop itself is not visible in this copy.
80 unsigned h
= WTF::stringHashingStartValue
;
// Hash-functions flag, false for this hash.
// NOTE(review): presumably tells HashMap not to pass its empty/deleted bucket
// markers to equal(), which dereferences both pointers - confirm against the
// WTF hash-functions contract.
95 static const bool safeToCompareToEmptyOrDeleted
= false;
98 struct TextCodecFactory
{
99 NewTextCodecFunction function
;
100 const void* additionalData
;
101 TextCodecFactory(NewTextCodecFunction f
= 0, const void* d
= 0) : function(f
), additionalData(d
) { }
104 typedef HashMap
<const char*, const char*, TextEncodingNameHash
> TextEncodingNameMap
;
105 typedef HashMap
<const char*, TextCodecFactory
> TextCodecMap
;
107 static Mutex
& encodingRegistryMutex()
109 // We don't have to use AtomicallyInitializedStatic here because
110 // this function is called on the main thread for any page before
111 // it is used in worker threads.
112 DEFINE_STATIC_LOCAL(Mutex
, mutex
, ());
116 static TextEncodingNameMap
* textEncodingNameMap
;
117 static TextCodecMap
* textCodecMap
;
// 0 until the extended codecs have been registered, 1 afterwards. Accessed
// only through the acquireLoad / releaseStore wrappers defined next to it.
static unsigned didExtendTextCodecMaps = 0;
122 ALWAYS_INLINE
unsigned atomicDidExtendTextCodecMaps()
124 return acquireLoad(&didExtendTextCodecMaps
);
127 ALWAYS_INLINE
void atomicSetDidExtendTextCodemMaps()
129 releaseStore(&didExtendTextCodecMaps
, 1);
// Encodings that pruneBlacklistedCodecs() removes from both registry maps
// after the extended codecs are registered. Each row holds up to five
// characters plus the terminating NUL.
static const char textEncodingNameBlacklist[][6] = { "UTF-7" };
// No-op variant of checkExistingName().
// NOTE(review): this and the logging variant below share one signature, so
// they are presumably selected by preprocessor conditionals on lines that are
// not visible here - confirm.
137 static inline void checkExistingName(const char*, const char*) { }
// Logging variant: reports through WTF_LOG_ERROR an attempt to point an
// already-registered alias at a different atomic name.
//   alias      - the alias about to be registered.
//   atomicName - the atomic name the caller wants the alias to map to.
141 static void checkExistingName(const char* alias
, const char* atomicName
)
// Look up what the alias currently maps to.
// NOTE(review): oldAtomicName is passed to strcmp() further down, so a null
// check presumably sits on a line not shown - confirm.
143 const char* oldAtomicName
= textEncodingNameMap
->get(alias
);
// Pointer comparison: the alias already maps to this very atomic name.
// NOTE(review): the statement guarded by this `if` is not visible; presumably
// an early return - confirm.
146 if (oldAtomicName
== atomicName
)
148 // Keep the warning silent about one case where we know this will happen.
// The known case: alias "ISO-8859-8-I" already maps to "ISO-8859-8-I" and the
// new target compares equal to "iso-8859-8" ignoring case.
149 if (strcmp(alias
, "ISO-8859-8-I") == 0
150 && strcmp(oldAtomicName
, "ISO-8859-8-I") == 0
151 && strcasecmp(atomicName
, "iso-8859-8") == 0)
153 WTF_LOG_ERROR("alias %s maps to %s already, but someone is trying to make it map to %s", alias
, oldAtomicName
, atomicName
);
// Decides whether an alias should be kept out of the registry;
// addToTextEncodingNameMap() consults it before adding anything.
158 static bool isUndesiredAlias(const char* alias
)
160 // Reject aliases with version numbers that are supported by some back-ends (such as "ISO_2022,locale=ja,version=0" in ICU).
// Walks the alias one character at a time up to the terminating NUL.
// NOTE(review): the loop body is not visible here; given the comment above it
// presumably looks for the ',' that introduces such suffixes - confirm.
161 for (const char* p
= alias
; *p
; ++p
) {
165 // 8859_1 is known to (at least) ICU, but other browsers don't support this name - and having it caused a compatibility
166 // problem, see bug 43554.
// Exact, case-sensitive comparison against "8859_1".
167 if (0 == strcmp(alias
, "8859_1"))
// Registers `alias` as a name for the encoding called `name` in
// textEncodingNameMap. Passed as the registrar callback to the
// TextCodec*::registerEncodingNames() functions.
//   alias - the name being added.
//   name  - the encoding it refers to; resolved to its atomic name below.
172 static void addToTextEncodingNameMap(const char* alias
, const char* name
)
// Debug-only check that the alias fits within maxEncodingNameLength.
174 ASSERT(strlen(alias
) <= maxEncodingNameLength
);
// NOTE(review): the statement guarded by this `if` is not visible; presumably
// an early return that skips undesired aliases - confirm.
175 if (isUndesiredAlias(alias
))
// Resolve `name` to the atomic name already stored in the map, if any.
177 const char* atomicName
= textEncodingNameMap
->get(name
);
// Either the name is being registered as its own alias, or it must already be
// known to the map.
178 ASSERT(strcmp(alias
, name
) == 0 || atomicName
);
// NOTE(review): when alias equals name the lookup above can come back null;
// the fallback for that case is presumably on a line not shown - confirm.
181 checkExistingName(alias
, atomicName
);
182 textEncodingNameMap
->add(alias
, atomicName
);
// Registers the codec factory for the encoding called `name`. Passed as the
// registrar callback to the TextCodec*::registerCodecs() functions.
//   name           - an encoding name; resolved to its atomic name below.
//   function       - codec constructor to store in the factory.
//   additionalData - opaque pointer stored alongside the function.
185 static void addToTextCodecMap(const char* name
, NewTextCodecFunction function
, const void* additionalData
)
// textCodecMap is keyed by atomic name, so translate `name` first.
// NOTE(review): a line between this lookup and the add() below is not visible
// in this copy; presumably a check that the name was found - confirm.
187 const char* atomicName
= textEncodingNameMap
->get(name
);
189 textCodecMap
->add(atomicName
, TextCodecFactory(function
, additionalData
));
// Removes every encoding listed in textEncodingNameBlacklist from both
// registry maps, together with all aliases that resolve to it. Called at the
// end of extendTextCodecMaps().
192 static void pruneBlacklistedCodecs()
194 for (size_t i
= 0; i
< WTF_ARRAY_LENGTH(textEncodingNameBlacklist
); ++i
) {
// Resolve the blacklisted name to its atomic name.
// NOTE(review): lines following this lookup are not visible; presumably the
// entry is skipped when the lookup finds nothing - confirm.
195 const char* atomicName
= textEncodingNameMap
->get(textEncodingNameBlacklist
[i
]);
// Collect the matching aliases first; they are removed in one call after the
// iteration over the map has finished.
199 Vector
<const char*> names
;
200 TextEncodingNameMap::const_iterator it
= textEncodingNameMap
->begin();
201 TextEncodingNameMap::const_iterator end
= textEncodingNameMap
->end();
202 for (; it
!= end
; ++it
) {
// Match by pointer against the atomic name resolved above.
203 if (it
->value
== atomicName
)
204 names
.append(it
->key
);
207 textEncodingNameMap
->removeAll(names
);
// Finally drop the codec factory registered under that atomic name.
209 textCodecMap
->remove(atomicName
);
213 static void buildBaseTextCodecMaps()
215 ASSERT(isMainThread());
216 ASSERT(!textCodecMap
);
217 ASSERT(!textEncodingNameMap
);
219 textCodecMap
= new TextCodecMap
;
220 textEncodingNameMap
= new TextEncodingNameMap
;
222 TextCodecLatin1::registerEncodingNames(addToTextEncodingNameMap
);
223 TextCodecLatin1::registerCodecs(addToTextCodecMap
);
225 TextCodecUTF8::registerEncodingNames(addToTextEncodingNameMap
);
226 TextCodecUTF8::registerCodecs(addToTextCodecMap
);
228 TextCodecUTF16::registerEncodingNames(addToTextEncodingNameMap
);
229 TextCodecUTF16::registerCodecs(addToTextCodecMap
);
231 TextCodecUserDefined::registerEncodingNames(addToTextEncodingNameMap
);
232 TextCodecUserDefined::registerCodecs(addToTextCodecMap
);
// Returns true when `alias` is non-null and equals "replacement", compared
// without regard to ASCII case. A null pointer yields false.
bool isReplacementEncoding(const char* alias)
{
    return alias && !strcasecmp(alias, "replacement");
}
240 bool isReplacementEncoding(const String
& alias
)
242 return alias
== "replacement";
245 static void extendTextCodecMaps()
247 TextCodecReplacement::registerEncodingNames(addToTextEncodingNameMap
);
248 TextCodecReplacement::registerCodecs(addToTextCodecMap
);
250 TextCodecICU::registerEncodingNames(addToTextEncodingNameMap
);
251 TextCodecICU::registerCodecs(addToTextCodecMap
);
253 pruneBlacklistedCodecs();
// Creates a codec for `encoding` by invoking the factory registered in
// textCodecMap under encoding.name().
// NOTE(review): some lines of this body are not visible in this copy.
256 PassOwnPtr
<TextCodec
> newTextCodec(const TextEncoding
& encoding
)
// The registry mutex is held from here to the end of the function.
258 MutexLocker
lock(encodingRegistryMutex());
// Debug-only check that the codec map has already been built.
261 ASSERT(textCodecMap
);
262 TextCodecFactory factory
= textCodecMap
->get(encoding
.name());
// Debug-only check that a factory function was found for this name.
263 ASSERT(factory
.function
);
// additionalData is forwarded unchanged to the factory function.
264 return factory
.function(encoding
, factory
.additionalData
);
// Resolves an encoding name or alias to its atomic (canonical) name pointer.
// The maps as currently populated are consulted first; the extended codecs
// are registered only when that lookup misses.
267 const char* atomicCanonicalTextEncodingName(const char* name
)
// Null and empty names are handled up front.
// NOTE(review): the guarded statement is not visible; presumably it returns
// null - confirm.
269 if (!name
|| !name
[0])
// Build the base maps on first use. This check runs before the mutex below is
// taken; buildBaseTextCodecMaps() asserts it is on the main thread.
271 if (!textEncodingNameMap
)
272 buildBaseTextCodecMaps();
274 MutexLocker
lock(encodingRegistryMutex());
// First attempt: look the name up in the maps as they stand.
// NOTE(review): the guarded statement is not visible; presumably it returns
// atomicName - confirm.
276 if (const char* atomicName
= textEncodingNameMap
->get(name
))
// The flag is set once the extended codecs have been registered.
// NOTE(review): the guarded statement is not visible; presumably it returns
// null because there is nothing further to register - confirm.
278 if (atomicDidExtendTextCodecMaps())
// Register the replacement and ICU codecs, publish the flag, then retry the
// lookup once.
280 extendTextCodecMaps();
281 atomicSetDidExtendTextCodemMaps();
282 return textEncodingNameMap
->get(name
);
// Character-array overload: examines `length` characters of type
// CharacterType and forwards a local char buffer to the const char* overload.
//   characters - the name's characters.
//   length     - number of characters to read from `characters`.
285 template <typename CharacterType
>
286 const char* atomicCanonicalTextEncodingName(const CharacterType
* characters
, size_t length
)
// Room for maxEncodingNameLength characters plus one more element.
288 char buffer
[maxEncodingNameLength
+ 1];
290 for (size_t i
= 0; i
< length
; ++i
) {
291 char c
= static_cast<char>(characters
[i
]);
// True when the buffer is already full, or when the character did not survive
// the narrowing cast to char unchanged.
// NOTE(review): the declaration of `j`, the guarded statement, and the lines
// that fill and terminate `buffer` are not visible in this copy - confirm.
292 if (j
== maxEncodingNameLength
|| c
!= characters
[i
])
297 return atomicCanonicalTextEncodingName(buffer
);
// String overload: forwards the string's characters and length to the
// character-array overload, instantiated for LChar (characters8()) or UChar
// (characters16()).
300 const char* atomicCanonicalTextEncodingName(const String
& alias
)
// Checks for an embedded NUL character in the alias.
// NOTE(review): the guarded statement is not visible; presumably such aliases
// are rejected, since the lookup works on a NUL-terminated buffer - confirm.
305 if (alias
.contains(static_cast<UChar
>('\0')))
// 8-bit storage path.
// NOTE(review): the condition choosing between this return and the 16-bit one
// below is not visible (presumably alias.is8Bit()) - confirm.
309 return atomicCanonicalTextEncodingName
<LChar
>(alias
.characters8(), alias
.length());
// 16-bit storage path.
311 return atomicCanonicalTextEncodingName
<UChar
>(alias
.characters16(), alias
.length());
314 bool noExtendedTextEncodingNameUsed()
316 return !atomicDidExtendTextCodecMaps();
320 void dumpTextEncodingNameMap()
322 unsigned size
= textEncodingNameMap
->size();
323 fprintf(stderr
, "Dumping %u entries in WTF::TextEncodingNameMap...\n", size
);
325 MutexLocker
lock(encodingRegistryMutex());
327 TextEncodingNameMap::const_iterator it
= textEncodingNameMap
->begin();
328 TextEncodingNameMap::const_iterator end
= textEncodingNameMap
->end();
329 for (; it
!= end
; ++it
)
330 fprintf(stderr
, "'%s' => '%s'\n", it
->key
, it
->value
);