Fix typo in 9b54bd30006c008b4a951331b273613d5bac3abf
[pm.git] / intl / unicharutil / nsUnicodeNormalizer.cpp
blob702386012a6c6558ca98835cb975b4e6316b14d4
1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
3 /* This file is modified from JPNIC's mDNKit, it is under both MPL and
4 * JPNIC's license.
5 */
7 /* This Source Code Form is subject to the terms of the Mozilla Public
8 * License, v. 2.0. If a copy of the MPL was not distributed with this
9 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
12 * Copyright (c) 2000,2002 Japan Network Information Center.
13 * All rights reserved.
15 * By using this file, you agree to the terms and conditions set forth bellow.
17 * LICENSE TERMS AND CONDITIONS
19 * The following License Terms and Conditions apply, unless a different
20 * license is obtained from Japan Network Information Center ("JPNIC"),
21 * a Japanese association, Kokusai-Kougyou-Kanda Bldg 6F, 2-3-4 Uchi-Kanda,
22 * Chiyoda-ku, Tokyo 101-0047, Japan.
24 * 1. Use, Modification and Redistribution (including distribution of any
25 * modified or derived work) in source and/or binary forms is permitted
26 * under this License Terms and Conditions.
28 * 2. Redistribution of source code must retain the copyright notices as they
29 * appear in each source code file, this License Terms and Conditions.
31 * 3. Redistribution in binary form must reproduce the Copyright Notice,
32 * this License Terms and Conditions, in the documentation and/or other
33 * materials provided with the distribution. For the purposes of binary
34 * distribution the "Copyright Notice" refers to the following language:
35 * "Copyright (c) 2000-2002 Japan Network Information Center. All rights reserved."
37 * 4. The name of JPNIC may not be used to endorse or promote products
38 * derived from this Software without specific prior written approval of
39 * JPNIC.
41 * 5. Disclaimer/Limitation of Liability: THIS SOFTWARE IS PROVIDED BY JPNIC
42 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
43 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
44 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL JPNIC BE LIABLE
45 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
46 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
47 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
48 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
49 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
50 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
51 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
54 #include <string.h>
56 #include "nsMemory.h"
57 #include "nsUnicodeNormalizer.h"
58 #include "nsString.h"
59 #include "mozilla/BinarySearch.h"
61 NS_IMPL_ISUPPORTS(nsUnicodeNormalizer, nsIUnicodeNormalizer)
64 nsUnicodeNormalizer::nsUnicodeNormalizer()
68 nsUnicodeNormalizer::~nsUnicodeNormalizer()
74 #define END_BIT 0x80000000
78 * Some constants for Hangul decomposition/composition.
79 * These things were taken from unicode book.
81 #define SBase 0xac00
82 #define LBase 0x1100
83 #define VBase 0x1161
84 #define TBase 0x11a7
85 #define LCount 19
86 #define VCount 21
87 #define TCount 28
88 #define SLast (SBase + LCount * VCount * TCount)
90 struct composition {
91 uint32_t c2; /* 2nd character */
92 uint32_t comp; /* composed character */
96 #include "normalization_data.h"
99 * Macro for multi-level index table.
101 #define LOOKUPTBL(vprefix, mprefix, v) \
102 DMAP(vprefix)[\
103 IMAP(vprefix)[\
104 IMAP(vprefix)[IDX0(mprefix, v)] + IDX1(mprefix, v)\
106 ].tbl[IDX2(mprefix, v)]
108 #define IDX0(mprefix, v) IDX_0(v, BITS1(mprefix), BITS2(mprefix))
109 #define IDX1(mprefix, v) IDX_1(v, BITS1(mprefix), BITS2(mprefix))
110 #define IDX2(mprefix, v) IDX_2(v, BITS1(mprefix), BITS2(mprefix))
112 #define IDX_0(v, bits1, bits2) ((v) >> ((bits1) + (bits2)))
113 #define IDX_1(v, bits1, bits2) (((v) >> (bits2)) & ((1 << (bits1)) - 1))
114 #define IDX_2(v, bits1, bits2) ((v) & ((1 << (bits2)) - 1))
116 #define BITS1(mprefix) mprefix ## _BITS_1
117 #define BITS2(mprefix) mprefix ## _BITS_2
119 #define IMAP(vprefix) vprefix ## _imap
120 #define DMAP(vprefix) vprefix ## _table
121 #define SEQ(vprefix) vprefix ## _seq
123 static int32_t
124 canonclass(uint32_t c) {
125 /* Look up canonicalclass table. */
126 return (LOOKUPTBL(canon_class, CANON_CLASS, c));
129 static int32_t
130 decompose_char(uint32_t c, const uint32_t **seqp)
132 /* Look up decomposition table. */
133 int32_t seqidx = LOOKUPTBL(decompose, DECOMP, c);
134 *seqp = SEQ(decompose) + (seqidx & ~DECOMP_COMPAT);
135 return (seqidx);
138 static int32_t
139 compose_char(uint32_t c,
140 const struct composition **compp)
142 /* Look up composition table. */
143 int32_t seqidx = LOOKUPTBL(compose, CANON_COMPOSE, c);
144 *compp = SEQ(compose) + (seqidx & 0xffff);
145 return (seqidx >> 16);
148 static nsresult
149 mdn__unicode_decompose(int32_t compat, uint32_t *v, size_t vlen,
150 uint32_t c, int32_t *decomp_lenp)
152 uint32_t *vorg = v;
153 int32_t seqidx;
154 const uint32_t *seq;
156 //assert(v != nullptr && vlen >= 0 && decomp_lenp != nullptr);
159 * First, check for Hangul.
161 if (SBase <= c && c < SLast) {
162 int32_t idx, t_offset, v_offset, l_offset;
164 idx = c - SBase;
165 t_offset = idx % TCount;
166 idx /= TCount;
167 v_offset = idx % VCount;
168 l_offset = idx / VCount;
169 if ((t_offset == 0 && vlen < 2) || (t_offset > 0 && vlen < 3))
170 return (NS_ERROR_UNORM_MOREOUTPUT);
171 *v++ = LBase + l_offset;
172 *v++ = VBase + v_offset;
173 if (t_offset > 0)
174 *v++ = TBase + t_offset;
175 *decomp_lenp = v - vorg;
176 return (NS_OK);
180 * Look up decomposition table. If no decomposition is defined
181 * or if it is a compatibility decomosition when canonical
182 * decomposition requested, return 'NS_SUCCESS_UNORM_NOTFOUND'.
184 seqidx = decompose_char(c, &seq);
185 if (seqidx == 0 || (compat == 0 && (seqidx & DECOMP_COMPAT) != 0))
186 return (NS_SUCCESS_UNORM_NOTFOUND);
189 * Copy the decomposed sequence. The end of the sequence are
190 * marked with END_BIT.
192 do {
193 uint32_t c;
194 int32_t dlen;
195 nsresult r;
197 c = *seq & ~END_BIT;
199 /* Decompose recursively. */
200 r = mdn__unicode_decompose(compat, v, vlen, c, &dlen);
201 if (r == NS_OK) {
202 v += dlen;
203 vlen -= dlen;
204 } else if (r == NS_SUCCESS_UNORM_NOTFOUND) {
205 if (vlen < 1)
206 return (NS_ERROR_UNORM_MOREOUTPUT);
207 *v++ = c;
208 vlen--;
209 } else {
210 return (r);
213 } while ((*seq++ & END_BIT) == 0);
215 *decomp_lenp = v - vorg;
217 return (NS_OK);
220 static int32_t
221 mdn__unicode_iscompositecandidate(uint32_t c)
223 const struct composition *dummy;
225 /* Check for Hangul */
226 if ((LBase <= c && c < LBase + LCount) || (SBase <= c && c < SLast))
227 return (1);
230 * Look up composition table. If there are no composition
231 * that begins with the given character, it is not a
232 * composition candidate.
234 if (compose_char(c, &dummy) == 0)
235 return (0);
236 else
237 return (1);
240 namespace {
242 struct SequenceAdaptor
244 const composition* const mSequence;
245 explicit SequenceAdaptor(const composition* aSequence) : mSequence(aSequence) {}
246 uint32_t operator[](size_t aIdx) const {
247 return mSequence[aIdx].c2;
251 } // namespace
253 static nsresult
254 mdn__unicode_compose(uint32_t c1, uint32_t c2, uint32_t *compp)
256 int32_t n;
257 const struct composition *cseq;
259 //assert(compp != nullptr);
262 * Check for Hangul.
264 if (LBase <= c1 && c1 < LBase + LCount &&
265 VBase <= c2 && c2 < VBase + VCount) {
267 * Hangul L and V.
269 *compp = SBase +
270 ((c1 - LBase) * VCount + (c2 - VBase)) * TCount;
271 return (NS_OK);
272 } else if (SBase <= c1 && c1 < SLast &&
273 TBase <= c2 && c2 < TBase + TCount &&
274 (c1 - SBase) % TCount == 0) {
276 * Hangul LV and T.
278 *compp = c1 + (c2 - TBase);
279 return (NS_OK);
283 * Look up composition table. If the result is 0, no composition
284 * is defined. Otherwise, upper 16bits of the result contains
285 * the number of composition that begins with 'c1', and the lower
286 * 16bits is the offset in 'compose_seq'.
288 if ((n = compose_char(c1, &cseq)) == 0)
289 return (NS_SUCCESS_UNORM_NOTFOUND);
292 * The composite sequences are sorted by the 2nd character 'c2'.
293 * So we can use binary search.
295 size_t idx;
296 if (mozilla::BinarySearch(SequenceAdaptor(cseq), 0, n, c2, &idx)) {
297 *compp = cseq[idx].comp;
298 return (NS_OK);
301 return (NS_SUCCESS_UNORM_NOTFOUND);
305 #define WORKBUF_SIZE 128
306 #define WORKBUF_SIZE_MAX 10000
308 typedef struct {
309 int32_t cur; /* pointing now processing character */
310 int32_t last; /* pointing just after the last character */
311 int32_t size; /* size of UCS and CLASS array */
312 uint32_t *ucs; /* UCS-4 characters */
313 int32_t *cclass; /* and their canonical classes */
314 uint32_t ucs_buf[WORKBUF_SIZE]; /* local buffer */
315 int32_t class_buf[WORKBUF_SIZE]; /* ditto */
316 } workbuf_t;
318 static nsresult decompose(workbuf_t *wb, uint32_t c, int32_t compat);
319 static void get_class(workbuf_t *wb);
320 static void reorder(workbuf_t *wb);
321 static void compose(workbuf_t *wb);
322 static nsresult flush_before_cur(workbuf_t *wb, nsAString& aToStr);
323 static void workbuf_init(workbuf_t *wb);
324 static void workbuf_free(workbuf_t *wb);
325 static nsresult workbuf_extend(workbuf_t *wb);
326 static nsresult workbuf_append(workbuf_t *wb, uint32_t c);
327 static void workbuf_shift(workbuf_t *wb, int32_t shift);
328 static void workbuf_removevoid(workbuf_t *wb);
331 static nsresult
332 mdn_normalize(bool do_composition, bool compat,
333 const nsAString& aSrcStr, nsAString& aToStr)
335 workbuf_t wb;
336 nsresult r = NS_OK;
338 * Initialize working buffer.
340 workbuf_init(&wb);
342 nsAString::const_iterator start, end;
343 aSrcStr.BeginReading(start);
344 aSrcStr.EndReading(end);
346 while (start != end) {
347 uint32_t c;
348 char16_t curChar;
350 //assert(wb.cur == wb.last);
353 * Get one character from 'from'.
355 curChar= *start++;
357 if (NS_IS_HIGH_SURROGATE(curChar) && start != end && NS_IS_LOW_SURROGATE(*(start)) ) {
358 c = SURROGATE_TO_UCS4(curChar, *start);
359 ++start;
360 } else {
361 c = curChar;
365 * Decompose it.
367 if ((r = decompose(&wb, c, compat)) != NS_OK)
368 break;
371 * Get canonical class.
373 get_class(&wb);
376 * Reorder & compose.
378 for (; wb.cur < wb.last; wb.cur++) {
379 if (wb.cur == 0) {
380 continue;
381 } else if (wb.cclass[wb.cur] > 0) {
383 * This is not a starter. Try reordering.
384 * Note that characters up to it are
385 * already in canonical order.
387 reorder(&wb);
388 continue;
392 * This is a starter character, and there are
393 * some characters before it. Those characters
394 * have been reordered properly, and
395 * ready for composition.
397 if (do_composition && wb.cclass[0] == 0)
398 compose(&wb);
401 * If CUR points to a starter character,
402 * then process of characters before CUR are
403 * already finished, because any further
404 * reordering/composition for them are blocked
405 * by the starter CUR points.
407 if (wb.cur > 0 && wb.cclass[wb.cur] == 0) {
408 /* Flush everything before CUR. */
409 r = flush_before_cur(&wb, aToStr);
410 if (r != NS_OK)
411 break;
416 if (r == NS_OK) {
417 if (do_composition && wb.cur > 0 && wb.cclass[0] == 0) {
419 * There is some characters left in WB.
420 * They are ordered, but not composed yet.
421 * Now CUR points just after the last character in WB,
422 * and since compose() tries to compose characters
423 * between top and CUR inclusive, we must make CUR
424 * one character back during compose().
426 wb.cur--;
427 compose(&wb);
428 wb.cur++;
431 * Call this even when WB.CUR == 0, to make TO
432 * NUL-terminated.
434 r = flush_before_cur(&wb, aToStr);
437 workbuf_free(&wb);
439 return (r);
442 static nsresult
443 decompose(workbuf_t *wb, uint32_t c, int32_t compat) {
444 nsresult r;
445 int32_t dec_len;
447 again:
448 r = mdn__unicode_decompose(compat, wb->ucs + wb->last,
449 wb->size - wb->last, c, &dec_len);
450 switch (r) {
451 case NS_OK:
452 wb->last += dec_len;
453 return (NS_OK);
454 case NS_SUCCESS_UNORM_NOTFOUND:
455 return (workbuf_append(wb, c));
456 case NS_ERROR_UNORM_MOREOUTPUT:
457 if ((r = workbuf_extend(wb)) != NS_OK)
458 return (r);
459 if (wb->size > WORKBUF_SIZE_MAX) {
460 // "mdn__unormalize_form*: " "working buffer too large\n"
461 return (NS_ERROR_FAILURE);
463 goto again;
464 default:
465 return (r);
467 /* NOTREACHED */
470 static void
471 get_class(workbuf_t *wb) {
472 int32_t i;
474 for (i = wb->cur; i < wb->last; i++)
475 wb->cclass[i] = canonclass(wb->ucs[i]);
478 static void
479 reorder(workbuf_t *wb) {
480 uint32_t c;
481 int32_t i;
482 int32_t cclass;
484 //assert(wb != nullptr);
486 i = wb->cur;
487 c = wb->ucs[i];
488 cclass = wb->cclass[i];
490 while (i > 0 && wb->cclass[i - 1] > cclass) {
491 wb->ucs[i] = wb->ucs[i - 1];
492 wb->cclass[i] =wb->cclass[i - 1];
493 i--;
494 wb->ucs[i] = c;
495 wb->cclass[i] = cclass;
499 static void
500 compose(workbuf_t *wb) {
501 int32_t cur;
502 uint32_t *ucs;
503 int32_t *cclass;
504 int32_t last_class;
505 int32_t nvoids;
506 int32_t i;
508 //assert(wb != nullptr && wb->cclass[0] == 0);
510 cur = wb->cur;
511 ucs = wb->ucs;
512 cclass = wb->cclass;
515 * If there are no decomposition sequence that begins with
516 * the top character, composition is impossible.
518 if (!mdn__unicode_iscompositecandidate(ucs[0]))
519 return;
521 last_class = 0;
522 nvoids = 0;
523 for (i = 1; i <= cur; i++) {
524 uint32_t c;
525 int32_t cl = cclass[i];
527 if ((last_class < cl || cl == 0) &&
528 mdn__unicode_compose(ucs[0], ucs[i],
529 &c) == NS_OK) {
531 * Replace the top character with the composed one.
533 ucs[0] = c;
534 cclass[0] = canonclass(c);
536 cclass[i] = -1; /* void this character */
537 nvoids++;
538 } else {
539 last_class = cl;
543 /* Purge void characters, if any. */
544 if (nvoids > 0)
545 workbuf_removevoid(wb);
548 static nsresult
549 flush_before_cur(workbuf_t *wb, nsAString& aToStr)
551 int32_t i;
553 for (i = 0; i < wb->cur; i++) {
554 if (!IS_IN_BMP(wb->ucs[i])) {
555 aToStr.Append((char16_t)H_SURROGATE(wb->ucs[i]));
556 aToStr.Append((char16_t)L_SURROGATE(wb->ucs[i]));
557 } else {
558 aToStr.Append((char16_t)(wb->ucs[i]));
562 workbuf_shift(wb, wb->cur);
564 return (NS_OK);
567 static void
568 workbuf_init(workbuf_t *wb) {
569 wb->cur = 0;
570 wb->last = 0;
571 wb->size = WORKBUF_SIZE;
572 wb->ucs = wb->ucs_buf;
573 wb->cclass = wb->class_buf;
576 static void
577 workbuf_free(workbuf_t *wb) {
578 if (wb->ucs != wb->ucs_buf) {
579 nsMemory::Free(wb->ucs);
580 nsMemory::Free(wb->cclass);
584 static nsresult
585 workbuf_extend(workbuf_t *wb) {
586 int32_t newsize = wb->size * 3;
588 if (wb->ucs == wb->ucs_buf) {
589 wb->ucs = (uint32_t*)nsMemory::Alloc(sizeof(wb->ucs[0]) * newsize);
590 if (!wb->ucs)
591 return NS_ERROR_OUT_OF_MEMORY;
592 wb->cclass = (int32_t*)nsMemory::Alloc(sizeof(wb->cclass[0]) * newsize);
593 if (!wb->cclass) {
594 nsMemory::Free(wb->ucs);
595 wb->ucs = nullptr;
596 return NS_ERROR_OUT_OF_MEMORY;
598 } else {
599 void* buf = nsMemory::Realloc(wb->ucs, sizeof(wb->ucs[0]) * newsize);
600 if (!buf)
601 return NS_ERROR_OUT_OF_MEMORY;
602 wb->ucs = (uint32_t*)buf;
603 buf = nsMemory::Realloc(wb->cclass, sizeof(wb->cclass[0]) * newsize);
604 if (!buf)
605 return NS_ERROR_OUT_OF_MEMORY;
606 wb->cclass = (int32_t*)buf;
608 return (NS_OK);
611 static nsresult
612 workbuf_append(workbuf_t *wb, uint32_t c) {
613 nsresult r;
615 if (wb->last >= wb->size && (r = workbuf_extend(wb)) != NS_OK)
616 return (r);
617 wb->ucs[wb->last++] = c;
618 return (NS_OK);
621 static void
622 workbuf_shift(workbuf_t *wb, int32_t shift) {
623 int32_t nmove;
625 //assert(wb != nullptr && wb->cur >= shift);
627 nmove = wb->last - shift;
628 memmove(&wb->ucs[0], &wb->ucs[shift],
629 nmove * sizeof(wb->ucs[0]));
630 memmove(&wb->cclass[0], &wb->cclass[shift],
631 nmove * sizeof(wb->cclass[0]));
632 wb->cur -= shift;
633 wb->last -= shift;
636 static void
637 workbuf_removevoid(workbuf_t *wb) {
638 int32_t i, j;
639 int32_t last = wb->last;
641 for (i = j = 0; i < last; i++) {
642 if (wb->cclass[i] >= 0) {
643 if (j < i) {
644 wb->ucs[j] = wb->ucs[i];
645 wb->cclass[j] = wb->cclass[i];
647 j++;
650 wb->cur -= last - j;
651 wb->last = j;
654 nsresult
655 nsUnicodeNormalizer::NormalizeUnicodeNFD( const nsAString& aSrc, nsAString& aDest)
657 return mdn_normalize(false, false, aSrc, aDest);
660 nsresult
661 nsUnicodeNormalizer::NormalizeUnicodeNFC( const nsAString& aSrc, nsAString& aDest)
663 return mdn_normalize(true, false, aSrc, aDest);
666 nsresult
667 nsUnicodeNormalizer::NormalizeUnicodeNFKD( const nsAString& aSrc, nsAString& aDest)
669 return mdn_normalize(false, true, aSrc, aDest);
672 nsresult
673 nsUnicodeNormalizer::NormalizeUnicodeNFKC( const nsAString& aSrc, nsAString& aDest)
675 return mdn_normalize(true, true, aSrc, aDest);
678 bool
679 nsUnicodeNormalizer::Compose(uint32_t a, uint32_t b, uint32_t *ab)
681 return mdn__unicode_compose(a, b, ab) == NS_OK;
684 bool
685 nsUnicodeNormalizer::DecomposeNonRecursively(uint32_t c, uint32_t *c1, uint32_t *c2)
687 // We can't use mdn__unicode_decompose here, because that does a recursive
688 // decomposition that may yield more than two characters, but the harfbuzz
689 // callback wants just a single-step decomp that is guaranteed to produce
690 // no more than two characters. So we do a low-level lookup in the table
691 // of decomp sequences.
692 const uint32_t *seq;
693 uint32_t seqidx = decompose_char(c, &seq);
694 if (seqidx == 0 || ((seqidx & DECOMP_COMPAT) != 0)) {
695 return false;
697 *c1 = *seq & ~END_BIT;
698 if (*seq & END_BIT) {
699 *c2 = 0;
700 } else {
701 *c2 = *++seq & ~END_BIT;
703 return true;