2 //===----------------------------------------------------------------------===//
4 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
5 // See https://llvm.org/LICENSE.txt for license information.
6 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
8 //===----------------------------------------------------------------------===//
10 // WARNING, this entire header is generated by
11 // utils/generate_extended_grapheme_cluster_table.py
14 // UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE
16 // See Terms of Use <https://www.unicode.org/copyright.html>
17 // for definitions of Unicode Inc.'s Data Files and Software.
19 // NOTICE TO USER: Carefully read the following legal agreement.
20 // BY DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING UNICODE INC.'S
21 // DATA FILES ("DATA FILES"), AND/OR SOFTWARE ("SOFTWARE"),
22 // YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE
23 // TERMS AND CONDITIONS OF THIS AGREEMENT.
24 // IF YOU DO NOT AGREE, DO NOT DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE
25 // THE DATA FILES OR SOFTWARE.
27 // COPYRIGHT AND PERMISSION NOTICE
29 // Copyright (c) 1991-2022 Unicode, Inc. All rights reserved.
30 // Distributed under the Terms of Use in https://www.unicode.org/copyright.html.
32 // Permission is hereby granted, free of charge, to any person obtaining
33 // a copy of the Unicode data files and any associated documentation
34 // (the "Data Files") or Unicode software and any associated documentation
35 // (the "Software") to deal in the Data Files or Software
36 // without restriction, including without limitation the rights to use,
37 // copy, modify, merge, publish, distribute, and/or sell copies of
38 // the Data Files or Software, and to permit persons to whom the Data Files
39 // or Software are furnished to do so, provided that either
40 // (a) this copyright and permission notice appear with all copies
41 // of the Data Files or Software, or
42 // (b) this copyright and permission notice appear in associated Documentation.
45 // THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF
46 // ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
47 // WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
48 // NONINFRINGEMENT OF THIRD PARTY RIGHTS.
49 // IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS
50 // NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL
51 // DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
52 // DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
53 // TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
54 // PERFORMANCE OF THE DATA FILES OR SOFTWARE.
56 // Except as contained in this notice, the name of a copyright holder
57 // shall not be used in advertising or otherwise to promote the sale,
58 // use or other dealings in these Data Files or Software without prior
59 // written authorization of the copyright holder.
61 #ifndef _LIBCPP___FORMAT_EXTENDED_GRAPHEME_CLUSTER_TABLE_H
62 #define _LIBCPP___FORMAT_EXTENDED_GRAPHEME_CLUSTER_TABLE_H
64 #include <__algorithm/ranges_upper_bound.h>
66 #include <__iterator/access.h>
70 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
71 # pragma GCC system_header
74 _LIBCPP_BEGIN_NAMESPACE_STD
76 #if _LIBCPP_STD_VER >= 20
78 namespace __extended_grapheme_custer_property_boundary
{
// Classification assigned to a code point by the extended grapheme cluster
// boundary lookup. The lookup extracts the value from the low 4 bits of a
// table entry, so the enumerators generated from the data files must fit in
// 4 bits.
// NOTE(review): only __Extended_Pictographic is visible in this enumerator
// list and the closing brace is missing, yet __get_property returns
// __property::__none -- the list looks truncated; verify against the output
// of the generator script.
80 enum class __property
: uint8_t {
81 // Values generated from the data files.
85 __Extended_Pictographic
,
97 // The properties below aren't stored in the "database".
99 // Text position properties.
103 // The code point has none of the above properties.
107 /// The entries of the extended grapheme cluster boundary property table.
109 /// The data is generated from
110 /// - https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakProperty.txt
111 /// - https://www.unicode.org/Public/UCD/latest/ucd/emoji/emoji-data.txt
113 /// Each 32-bit entry packs 3 values
114 /// - bits [0, 3] The property. One of the values generated from the data files
115 /// of \ref __property
116 /// - bits [4, 10] The size of the range.
117 /// - bits [11, 31] The lower bound code point of the range. The upper bound of
118 /// the range is lower bound + size (inclusive).
/// The entries must be in ascending order: the lookup performs a binary
/// search (std::ranges::upper_bound) over this array.
120 /// The 7 bits for the size allow a maximum range of 128 elements. Some ranges
121 /// in the Unicode tables are larger. They are stored in multiple consecutive
122 /// ranges in the data table. An alternative would be to store the sizes in a
123 /// separate 16-bit value. The original MSVC STL code had such an approach, but
124 /// this approach uses less space for the data and is about 4% faster in the
125 /// following benchmark.
126 /// libcxx/benchmarks/std_format_spec_string_unicode.bench.cpp
127 inline constexpr uint32_t __entries
[1496] = {
1625 /// Returns the extended grapheme cluster bondary property of a code point.
1626 [[nodiscard
]] _LIBCPP_HIDE_FROM_ABI
constexpr __property
__get_property(const char32_t __code_point
) noexcept
{
1627 // The algorithm searches for the upper bound of the range and, when found,
1628 // steps back one entry. This algorithm is used since the code point can be
1629 // anywhere in the range. After a lower bound is found the next step is to
1630 // compare whether the code unit is indeed in the range.
1632 // Since the entry contains a code unit, size, and property the code point
1633 // being sought needs to be adjusted. Just shifting the code point to the
1634 // proper position doesn't work; suppose an entry has property 0, size 1,
1635 // and lower bound 3. This results in the entry 0x1810.
1636 // When searching for code point 3 it will search for 0x1800, find 0x1810
1637 // and moves to the previous entry. Thus the lower bound value will never
1639 // The simple solution is to set the bits belonging to the property and
1640 // size. Then the upper bound for code point 3 will return the entry after
1641 // 0x1810. After moving to the previous entry the algorithm arrives at the
1643 ptrdiff_t __i
= std::ranges::upper_bound(__entries
, (__code_point
<< 11) | 0x7ffu
) - __entries
;
1645 return __property::__none
;
1648 uint32_t __upper_bound
= (__entries
[__i
] >> 11) + ((__entries
[__i
] >> 4) & 0x7f);
1649 if (__code_point
<= __upper_bound
)
1650 return static_cast<__property
>(__entries
[__i
] & 0xf);
1652 return __property::__none
;
1655 } // namespace __extended_grapheme_custer_property_boundary
1657 #endif //_LIBCPP_STD_VER >= 20
1659 _LIBCPP_END_NAMESPACE_STD
1661 #endif // _LIBCPP___FORMAT_EXTENDED_GRAPHEME_CLUSTER_TABLE_H