1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
3 /* This file is modified from JPNIC's mDNKit, it is under both MPL and
7 /* This Source Code Form is subject to the terms of the Mozilla Public
8 * License, v. 2.0. If a copy of the MPL was not distributed with this
9 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
12 * Copyright (c) 2000,2002 Japan Network Information Center.
13 * All rights reserved.
15 * By using this file, you agree to the terms and conditions set forth bellow.
17 * LICENSE TERMS AND CONDITIONS
19 * The following License Terms and Conditions apply, unless a different
20 * license is obtained from Japan Network Information Center ("JPNIC"),
21 * a Japanese association, Kokusai-Kougyou-Kanda Bldg 6F, 2-3-4 Uchi-Kanda,
22 * Chiyoda-ku, Tokyo 101-0047, Japan.
24 * 1. Use, Modification and Redistribution (including distribution of any
25 * modified or derived work) in source and/or binary forms is permitted
26 * under this License Terms and Conditions.
28 * 2. Redistribution of source code must retain the copyright notices as they
29 * appear in each source code file, this License Terms and Conditions.
31 * 3. Redistribution in binary form must reproduce the Copyright Notice,
32 * this License Terms and Conditions, in the documentation and/or other
33 * materials provided with the distribution. For the purposes of binary
34 * distribution the "Copyright Notice" refers to the following language:
35 * "Copyright (c) 2000-2002 Japan Network Information Center. All rights reserved."
37 * 4. The name of JPNIC may not be used to endorse or promote products
38 * derived from this Software without specific prior written approval of
41 * 5. Disclaimer/Limitation of Liability: THIS SOFTWARE IS PROVIDED BY JPNIC
42 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
43 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
44 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL JPNIC BE LIABLE
45 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
46 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
47 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
48 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
49 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
50 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
51 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
57 #include "nsUnicodeNormalizer.h"
59 #include "mozilla/BinarySearch.h"
61 NS_IMPL_ISUPPORTS(nsUnicodeNormalizer
, nsIUnicodeNormalizer
)
64 nsUnicodeNormalizer::nsUnicodeNormalizer()
68 nsUnicodeNormalizer::~nsUnicodeNormalizer()
74 #define END_BIT 0x80000000
78 * Some constants for Hangul decomposition/composition.
79 * These things were taken from unicode book.
88 #define SLast (SBase + LCount * VCount * TCount)
91 uint32_t c2
; /* 2nd character */
92 uint32_t comp
; /* composed character */
96 #include "normalization_data.h"
99 * Macro for multi-level index table.
101 #define LOOKUPTBL(vprefix, mprefix, v) \
104 IMAP(vprefix)[IDX0(mprefix, v)] + IDX1(mprefix, v)\
106 ].tbl[IDX2(mprefix, v)]
108 #define IDX0(mprefix, v) IDX_0(v, BITS1(mprefix), BITS2(mprefix))
109 #define IDX1(mprefix, v) IDX_1(v, BITS1(mprefix), BITS2(mprefix))
110 #define IDX2(mprefix, v) IDX_2(v, BITS1(mprefix), BITS2(mprefix))
112 #define IDX_0(v, bits1, bits2) ((v) >> ((bits1) + (bits2)))
113 #define IDX_1(v, bits1, bits2) (((v) >> (bits2)) & ((1 << (bits1)) - 1))
114 #define IDX_2(v, bits1, bits2) ((v) & ((1 << (bits2)) - 1))
116 #define BITS1(mprefix) mprefix ## _BITS_1
117 #define BITS2(mprefix) mprefix ## _BITS_2
119 #define IMAP(vprefix) vprefix ## _imap
120 #define DMAP(vprefix) vprefix ## _table
121 #define SEQ(vprefix) vprefix ## _seq
124 canonclass(uint32_t c
) {
125 /* Look up canonicalclass table. */
126 return (LOOKUPTBL(canon_class
, CANON_CLASS
, c
));
130 decompose_char(uint32_t c
, const uint32_t **seqp
)
132 /* Look up decomposition table. */
133 int32_t seqidx
= LOOKUPTBL(decompose
, DECOMP
, c
);
134 *seqp
= SEQ(decompose
) + (seqidx
& ~DECOMP_COMPAT
);
139 compose_char(uint32_t c
,
140 const struct composition
**compp
)
142 /* Look up composition table. */
143 int32_t seqidx
= LOOKUPTBL(compose
, CANON_COMPOSE
, c
);
144 *compp
= SEQ(compose
) + (seqidx
& 0xffff);
145 return (seqidx
>> 16);
149 mdn__unicode_decompose(int32_t compat
, uint32_t *v
, size_t vlen
,
150 uint32_t c
, int32_t *decomp_lenp
)
156 //assert(v != nullptr && vlen >= 0 && decomp_lenp != nullptr);
159 * First, check for Hangul.
161 if (SBase
<= c
&& c
< SLast
) {
162 int32_t idx
, t_offset
, v_offset
, l_offset
;
165 t_offset
= idx
% TCount
;
167 v_offset
= idx
% VCount
;
168 l_offset
= idx
/ VCount
;
169 if ((t_offset
== 0 && vlen
< 2) || (t_offset
> 0 && vlen
< 3))
170 return (NS_ERROR_UNORM_MOREOUTPUT
);
171 *v
++ = LBase
+ l_offset
;
172 *v
++ = VBase
+ v_offset
;
174 *v
++ = TBase
+ t_offset
;
175 *decomp_lenp
= v
- vorg
;
180 * Look up decomposition table. If no decomposition is defined
181 * or if it is a compatibility decomosition when canonical
182 * decomposition requested, return 'NS_SUCCESS_UNORM_NOTFOUND'.
184 seqidx
= decompose_char(c
, &seq
);
185 if (seqidx
== 0 || (compat
== 0 && (seqidx
& DECOMP_COMPAT
) != 0))
186 return (NS_SUCCESS_UNORM_NOTFOUND
);
189 * Copy the decomposed sequence. The end of the sequence are
190 * marked with END_BIT.
199 /* Decompose recursively. */
200 r
= mdn__unicode_decompose(compat
, v
, vlen
, c
, &dlen
);
204 } else if (r
== NS_SUCCESS_UNORM_NOTFOUND
) {
206 return (NS_ERROR_UNORM_MOREOUTPUT
);
213 } while ((*seq
++ & END_BIT
) == 0);
215 *decomp_lenp
= v
- vorg
;
221 mdn__unicode_iscompositecandidate(uint32_t c
)
223 const struct composition
*dummy
;
225 /* Check for Hangul */
226 if ((LBase
<= c
&& c
< LBase
+ LCount
) || (SBase
<= c
&& c
< SLast
))
230 * Look up composition table. If there are no composition
231 * that begins with the given character, it is not a
232 * composition candidate.
234 if (compose_char(c
, &dummy
) == 0)
242 struct SequenceAdaptor
244 const composition
* const mSequence
;
245 explicit SequenceAdaptor(const composition
* aSequence
) : mSequence(aSequence
) {}
246 uint32_t operator[](size_t aIdx
) const {
247 return mSequence
[aIdx
].c2
;
254 mdn__unicode_compose(uint32_t c1
, uint32_t c2
, uint32_t *compp
)
257 const struct composition
*cseq
;
259 //assert(compp != nullptr);
264 if (LBase
<= c1
&& c1
< LBase
+ LCount
&&
265 VBase
<= c2
&& c2
< VBase
+ VCount
) {
270 ((c1
- LBase
) * VCount
+ (c2
- VBase
)) * TCount
;
272 } else if (SBase
<= c1
&& c1
< SLast
&&
273 TBase
<= c2
&& c2
< TBase
+ TCount
&&
274 (c1
- SBase
) % TCount
== 0) {
278 *compp
= c1
+ (c2
- TBase
);
283 * Look up composition table. If the result is 0, no composition
284 * is defined. Otherwise, upper 16bits of the result contains
285 * the number of composition that begins with 'c1', and the lower
286 * 16bits is the offset in 'compose_seq'.
288 if ((n
= compose_char(c1
, &cseq
)) == 0)
289 return (NS_SUCCESS_UNORM_NOTFOUND
);
292 * The composite sequences are sorted by the 2nd character 'c2'.
293 * So we can use binary search.
296 if (mozilla::BinarySearch(SequenceAdaptor(cseq
), 0, n
, c2
, &idx
)) {
297 *compp
= cseq
[idx
].comp
;
301 return (NS_SUCCESS_UNORM_NOTFOUND
);
305 #define WORKBUF_SIZE 128
306 #define WORKBUF_SIZE_MAX 10000
309 int32_t cur
; /* pointing now processing character */
310 int32_t last
; /* pointing just after the last character */
311 int32_t size
; /* size of UCS and CLASS array */
312 uint32_t *ucs
; /* UCS-4 characters */
313 int32_t *cclass
; /* and their canonical classes */
314 uint32_t ucs_buf
[WORKBUF_SIZE
]; /* local buffer */
315 int32_t class_buf
[WORKBUF_SIZE
]; /* ditto */
318 static nsresult
decompose(workbuf_t
*wb
, uint32_t c
, int32_t compat
);
319 static void get_class(workbuf_t
*wb
);
320 static void reorder(workbuf_t
*wb
);
321 static void compose(workbuf_t
*wb
);
322 static nsresult
flush_before_cur(workbuf_t
*wb
, nsAString
& aToStr
);
323 static void workbuf_init(workbuf_t
*wb
);
324 static void workbuf_free(workbuf_t
*wb
);
325 static nsresult
workbuf_extend(workbuf_t
*wb
);
326 static nsresult
workbuf_append(workbuf_t
*wb
, uint32_t c
);
327 static void workbuf_shift(workbuf_t
*wb
, int32_t shift
);
328 static void workbuf_removevoid(workbuf_t
*wb
);
332 mdn_normalize(bool do_composition
, bool compat
,
333 const nsAString
& aSrcStr
, nsAString
& aToStr
)
338 * Initialize working buffer.
342 nsAString::const_iterator start
, end
;
343 aSrcStr
.BeginReading(start
);
344 aSrcStr
.EndReading(end
);
346 while (start
!= end
) {
350 //assert(wb.cur == wb.last);
353 * Get one character from 'from'.
357 if (NS_IS_HIGH_SURROGATE(curChar
) && start
!= end
&& NS_IS_LOW_SURROGATE(*(start
)) ) {
358 c
= SURROGATE_TO_UCS4(curChar
, *start
);
367 if ((r
= decompose(&wb
, c
, compat
)) != NS_OK
)
371 * Get canonical class.
378 for (; wb
.cur
< wb
.last
; wb
.cur
++) {
381 } else if (wb
.cclass
[wb
.cur
] > 0) {
383 * This is not a starter. Try reordering.
384 * Note that characters up to it are
385 * already in canonical order.
392 * This is a starter character, and there are
393 * some characters before it. Those characters
394 * have been reordered properly, and
395 * ready for composition.
397 if (do_composition
&& wb
.cclass
[0] == 0)
401 * If CUR points to a starter character,
402 * then process of characters before CUR are
403 * already finished, because any further
404 * reordering/composition for them are blocked
405 * by the starter CUR points.
407 if (wb
.cur
> 0 && wb
.cclass
[wb
.cur
] == 0) {
408 /* Flush everything before CUR. */
409 r
= flush_before_cur(&wb
, aToStr
);
417 if (do_composition
&& wb
.cur
> 0 && wb
.cclass
[0] == 0) {
419 * There is some characters left in WB.
420 * They are ordered, but not composed yet.
421 * Now CUR points just after the last character in WB,
422 * and since compose() tries to compose characters
423 * between top and CUR inclusive, we must make CUR
424 * one character back during compose().
431 * Call this even when WB.CUR == 0, to make TO
434 r
= flush_before_cur(&wb
, aToStr
);
443 decompose(workbuf_t
*wb
, uint32_t c
, int32_t compat
) {
448 r
= mdn__unicode_decompose(compat
, wb
->ucs
+ wb
->last
,
449 wb
->size
- wb
->last
, c
, &dec_len
);
454 case NS_SUCCESS_UNORM_NOTFOUND
:
455 return (workbuf_append(wb
, c
));
456 case NS_ERROR_UNORM_MOREOUTPUT
:
457 if ((r
= workbuf_extend(wb
)) != NS_OK
)
459 if (wb
->size
> WORKBUF_SIZE_MAX
) {
460 // "mdn__unormalize_form*: " "working buffer too large\n"
461 return (NS_ERROR_FAILURE
);
471 get_class(workbuf_t
*wb
) {
474 for (i
= wb
->cur
; i
< wb
->last
; i
++)
475 wb
->cclass
[i
] = canonclass(wb
->ucs
[i
]);
479 reorder(workbuf_t
*wb
) {
484 //assert(wb != nullptr);
488 cclass
= wb
->cclass
[i
];
490 while (i
> 0 && wb
->cclass
[i
- 1] > cclass
) {
491 wb
->ucs
[i
] = wb
->ucs
[i
- 1];
492 wb
->cclass
[i
] =wb
->cclass
[i
- 1];
495 wb
->cclass
[i
] = cclass
;
500 compose(workbuf_t
*wb
) {
508 //assert(wb != nullptr && wb->cclass[0] == 0);
515 * If there are no decomposition sequence that begins with
516 * the top character, composition is impossible.
518 if (!mdn__unicode_iscompositecandidate(ucs
[0]))
523 for (i
= 1; i
<= cur
; i
++) {
525 int32_t cl
= cclass
[i
];
527 if ((last_class
< cl
|| cl
== 0) &&
528 mdn__unicode_compose(ucs
[0], ucs
[i
],
531 * Replace the top character with the composed one.
534 cclass
[0] = canonclass(c
);
536 cclass
[i
] = -1; /* void this character */
543 /* Purge void characters, if any. */
545 workbuf_removevoid(wb
);
549 flush_before_cur(workbuf_t
*wb
, nsAString
& aToStr
)
553 for (i
= 0; i
< wb
->cur
; i
++) {
554 if (!IS_IN_BMP(wb
->ucs
[i
])) {
555 aToStr
.Append((char16_t
)H_SURROGATE(wb
->ucs
[i
]));
556 aToStr
.Append((char16_t
)L_SURROGATE(wb
->ucs
[i
]));
558 aToStr
.Append((char16_t
)(wb
->ucs
[i
]));
562 workbuf_shift(wb
, wb
->cur
);
568 workbuf_init(workbuf_t
*wb
) {
571 wb
->size
= WORKBUF_SIZE
;
572 wb
->ucs
= wb
->ucs_buf
;
573 wb
->cclass
= wb
->class_buf
;
577 workbuf_free(workbuf_t
*wb
) {
578 if (wb
->ucs
!= wb
->ucs_buf
) {
579 nsMemory::Free(wb
->ucs
);
580 nsMemory::Free(wb
->cclass
);
585 workbuf_extend(workbuf_t
*wb
) {
586 int32_t newsize
= wb
->size
* 3;
588 if (wb
->ucs
== wb
->ucs_buf
) {
589 wb
->ucs
= (uint32_t*)nsMemory::Alloc(sizeof(wb
->ucs
[0]) * newsize
);
591 return NS_ERROR_OUT_OF_MEMORY
;
592 wb
->cclass
= (int32_t*)nsMemory::Alloc(sizeof(wb
->cclass
[0]) * newsize
);
594 nsMemory::Free(wb
->ucs
);
596 return NS_ERROR_OUT_OF_MEMORY
;
599 void* buf
= nsMemory::Realloc(wb
->ucs
, sizeof(wb
->ucs
[0]) * newsize
);
601 return NS_ERROR_OUT_OF_MEMORY
;
602 wb
->ucs
= (uint32_t*)buf
;
603 buf
= nsMemory::Realloc(wb
->cclass
, sizeof(wb
->cclass
[0]) * newsize
);
605 return NS_ERROR_OUT_OF_MEMORY
;
606 wb
->cclass
= (int32_t*)buf
;
612 workbuf_append(workbuf_t
*wb
, uint32_t c
) {
615 if (wb
->last
>= wb
->size
&& (r
= workbuf_extend(wb
)) != NS_OK
)
617 wb
->ucs
[wb
->last
++] = c
;
622 workbuf_shift(workbuf_t
*wb
, int32_t shift
) {
625 //assert(wb != nullptr && wb->cur >= shift);
627 nmove
= wb
->last
- shift
;
628 memmove(&wb
->ucs
[0], &wb
->ucs
[shift
],
629 nmove
* sizeof(wb
->ucs
[0]));
630 memmove(&wb
->cclass
[0], &wb
->cclass
[shift
],
631 nmove
* sizeof(wb
->cclass
[0]));
637 workbuf_removevoid(workbuf_t
*wb
) {
639 int32_t last
= wb
->last
;
641 for (i
= j
= 0; i
< last
; i
++) {
642 if (wb
->cclass
[i
] >= 0) {
644 wb
->ucs
[j
] = wb
->ucs
[i
];
645 wb
->cclass
[j
] = wb
->cclass
[i
];
655 nsUnicodeNormalizer::NormalizeUnicodeNFD( const nsAString
& aSrc
, nsAString
& aDest
)
657 return mdn_normalize(false, false, aSrc
, aDest
);
661 nsUnicodeNormalizer::NormalizeUnicodeNFC( const nsAString
& aSrc
, nsAString
& aDest
)
663 return mdn_normalize(true, false, aSrc
, aDest
);
667 nsUnicodeNormalizer::NormalizeUnicodeNFKD( const nsAString
& aSrc
, nsAString
& aDest
)
669 return mdn_normalize(false, true, aSrc
, aDest
);
673 nsUnicodeNormalizer::NormalizeUnicodeNFKC( const nsAString
& aSrc
, nsAString
& aDest
)
675 return mdn_normalize(true, true, aSrc
, aDest
);
679 nsUnicodeNormalizer::Compose(uint32_t a
, uint32_t b
, uint32_t *ab
)
681 return mdn__unicode_compose(a
, b
, ab
) == NS_OK
;
685 nsUnicodeNormalizer::DecomposeNonRecursively(uint32_t c
, uint32_t *c1
, uint32_t *c2
)
687 // We can't use mdn__unicode_decompose here, because that does a recursive
688 // decomposition that may yield more than two characters, but the harfbuzz
689 // callback wants just a single-step decomp that is guaranteed to produce
690 // no more than two characters. So we do a low-level lookup in the table
691 // of decomp sequences.
693 uint32_t seqidx
= decompose_char(c
, &seq
);
694 if (seqidx
== 0 || ((seqidx
& DECOMP_COMPAT
) != 0)) {
697 *c1
= *seq
& ~END_BIT
;
698 if (*seq
& END_BIT
) {
701 *c2
= *++seq
& ~END_BIT
;