2 //===----------------------------------------------------------------------===//
4 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
5 // See https://llvm.org/LICENSE.txt for license information.
6 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
8 //===----------------------------------------------------------------------===//
10 // WARNING, this entire header is generated by
11 // utils/generate_indic_conjunct_break_table.py
14 // UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE
16 // See Terms of Use <https://www.unicode.org/copyright.html>
17 // for definitions of Unicode Inc.'s Data Files and Software.
19 // NOTICE TO USER: Carefully read the following legal agreement.
20 // BY DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING UNICODE INC.'S
21 // DATA FILES ("DATA FILES"), AND/OR SOFTWARE ("SOFTWARE"),
22 // YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE
23 // TERMS AND CONDITIONS OF THIS AGREEMENT.
24 // IF YOU DO NOT AGREE, DO NOT DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE
25 // THE DATA FILES OR SOFTWARE.
27 // COPYRIGHT AND PERMISSION NOTICE
29 // Copyright (c) 1991-2022 Unicode, Inc. All rights reserved.
30 // Distributed under the Terms of Use in https://www.unicode.org/copyright.html.
32 // Permission is hereby granted, free of charge, to any person obtaining
33 // a copy of the Unicode data files and any associated documentation
34 // (the "Data Files") or Unicode software and any associated documentation
35 // (the "Software") to deal in the Data Files or Software
36 // without restriction, including without limitation the rights to use,
37 // copy, modify, merge, publish, distribute, and/or sell copies of
38 // the Data Files or Software, and to permit persons to whom the Data Files
39 // or Software are furnished to do so, provided that either
40 // (a) this copyright and permission notice appear with all copies
41 // of the Data Files or Software, or
42 // (b) this copyright and permission notice appear in associated Documentation.
45 // THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF
46 // ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
47 // WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
48 // NONINFRINGEMENT OF THIRD PARTY RIGHTS.
49 // IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS
50 // NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL
51 // DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
52 // DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
53 // TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
54 // PERFORMANCE OF THE DATA FILES OR SOFTWARE.
56 // Except as contained in this notice, the name of a copyright holder
57 // shall not be used in advertising or otherwise to promote the sale,
58 // use or other dealings in these Data Files or Software without prior
59 // written authorization of the copyright holder.
61 #ifndef _LIBCPP___FORMAT_INDIC_CONJUNCT_BREAK_TABLE_H
62 #define _LIBCPP___FORMAT_INDIC_CONJUNCT_BREAK_TABLE_H
64 #include <__algorithm/ranges_upper_bound.h>
66 #include <__cstddef/ptrdiff_t.h>
67 #include <__iterator/access.h>
70 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
71 # pragma GCC system_header
74 _LIBCPP_BEGIN_NAMESPACE_STD
76 #if _LIBCPP_STD_VER >= 20
78 namespace __indic_conjunct_break
{
80 enum class __property
: uint8_t {
81 // Values generated from the data files.
86 // The code unit has none of the above properties.
90 /// The entries of the indic conjunct break property table.
92 /// The data is generated from
93 /// - https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt
95 /// The data has 3 values
96 /// - bits [0, 1] The property. One of the values generated from the datafiles
97 /// of \ref __property
98 /// - bits [2, 10] The size of the range.
99 /// - bits [11, 31] The lower bound code point of the range. The upper bound of
100 /// the range is lower bound + size.
102 /// The 9 bits for the size allow a maximum range of 512 elements. Some ranges
103 /// in the Unicode tables are larger. They are stored in multiple consecutive
104 /// ranges in the data table. An alternative would be to store the sizes in a
105 /// separate 16-bit value. The original MSVC STL code had such an approach, but
106 /// this approach uses less space for the data and is about 4% faster in the
107 /// following benchmark.
108 /// libcxx/benchmarks/std_format_spec_string_unicode.bench.cpp
110 _LIBCPP_HIDE_FROM_ABI
inline constexpr uint32_t __entries
[201] = {
314 /// Returns the indic conjuct break property of a code point.
315 [[nodiscard
]] _LIBCPP_HIDE_FROM_ABI
constexpr __property
__get_property(const char32_t __code_point
) noexcept
{
316 // The algorithm searches for the upper bound of the range and, when found,
317 // steps back one entry. This algorithm is used since the code point can be
318 // anywhere in the range. After a lower bound is found the next step is to
319 // compare whether the code unit is indeed in the range.
321 // Since the entry contains a code unit, size, and property the code point
322 // being sought needs to be adjusted. Just shifting the code point to the
323 // proper position doesn't work; suppose an entry has property 0, size 1,
324 // and lower bound 3. This results in the entry 0x1810.
325 // When searching for code point 3 it will search for 0x1800, find 0x1810
326 // and moves to the previous entry. Thus the lower bound value will never
328 // The simple solution is to set the bits belonging to the property and
329 // size. Then the upper bound for code point 3 will return the entry after
330 // 0x1810. After moving to the previous entry the algorithm arrives at the
332 ptrdiff_t __i
= std::ranges::upper_bound(__entries
, (__code_point
<< 11) | 0x7ffu
) - __entries
;
334 return __property::__none
;
337 uint32_t __upper_bound
= (__entries
[__i
] >> 11) + ((__entries
[__i
] >> 2) & 0b1'1111'1111);
338 if (__code_point
<= __upper_bound
)
339 return static_cast<__property
>(__entries
[__i
] & 0b11);
341 return __property::__none
;
344 } // namespace __indic_conjunct_break
346 #endif // _LIBCPP_STD_VER >= 20
348 _LIBCPP_END_NAMESPACE_STD
350 #endif // _LIBCPP___FORMAT_INDIC_CONJUNCT_BREAK_TABLE_H