Avoid potential negative array index access to cached text.
[LibreOffice.git] / include / i18nlangtag / languagetag.hxx
blob39982148bcbb888ab637fe0f8bb8b6f7744588f9
1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8 */
10 #ifndef INCLUDED_I18NLANGTAG_LANGUAGETAG_HXX
11 #define INCLUDED_I18NLANGTAG_LANGUAGETAG_HXX
13 #include <sal/config.h>
14 #include <rtl/locale.h>
15 #include <rtl/ustring.hxx>
16 #include <com/sun/star/lang/Locale.hpp>
17 #include <i18nlangtag/i18nlangtagdllapi.h>
18 #include <i18nlangtag/lang.h>
20 #include <memory>
21 #include <string_view>
22 #include <vector>
24 /** The ISO 639-2 code reserved for local use used to indicate that a
25 css::Locale contains a BCP 47 string in its Variant field. The
26 Locale's Language field then will contain this language code.
28 @see LanguageTag::getLocale()
30 Avoid use, only needed internally or if conversion from Locale to
31 LanguageTag is not wanted, i.e. during ODF import. To check whether a
32 LanguageTag contains a plain language/country combination or a more
33 detailed BCP 47 language tag use LanguageTag::isIsoLocale() instead.
35 #define I18NLANGTAG_QLT_ASCII "qlt"
36 inline constexpr OUString I18NLANGTAG_QLT = u"qlt"_ustr;
39 class LanguageTagImpl;
42 /** Wrapper for liblangtag BCP 47 language tags, MS-LangIDs, locales and
43 conversions in between.
45 Note that member variables are mutable and may change their values even in
46 const methods. Getter methods return either the original value or matching
47 converted values.
49 For standalone conversions if no LanguageTag instance is at hand, static
50 convertTo...() methods exist.
52 class SAL_WARN_UNUSED I18NLANGTAG_DLLPUBLIC LanguageTag
54 friend class LanguageTagImpl;
56 public:
58 /** ScriptType for a language.
60 Used only in onTheFly languages as a way of marking key script behaviours
61 for the script of the language without having to store and analyse the
62 script each time. Used primarily from msLangId.
64 These need to correspond to the ExtraLanguages.ScriptType template
65 property in officecfg/registry/schema/org/openoffice/VCL.xcs
67 enum class ScriptType
69 UNKNOWN = 0,
70 WESTERN = 1, // Copies css::i18n::ScriptType for strong types
71 CJK = 2,
72 CTL = 3,
73 RTL = 4 // implies CTL
76 /** Init LanguageTag with existing BCP 47 language tag string.
78 @param bCanonicalize
79 If TRUE, canonicalize tag and reparse, the resulting tag string may
80 be different.
81 If FALSE, the tag is simply stored and can be retrieved with
82 getBcp47().
84 Note that conversions to ISO codes, locales or LanguageType or
85 obtaining language or script will canonicalize the tag string anyway,
86 so specifying bCanonicalize=false is not a guarantee that the tag will
87 stay identical to what was passed.
89 explicit LanguageTag( const OUString & rBcp47LanguageTag, bool bCanonicalize = false );
91 /** Init LanguageTag with Locale. */
92 explicit LanguageTag( const css::lang::Locale & rLocale );
94 /** Init LanguageTag with LanguageType MS-LangID. */
95 explicit LanguageTag( LanguageType nLanguage );
97 /** Init LanguageTag with either BCP 47 language tag (precedence if not
98 empty), or a combination of language, script and country.
100 This is a convenience ctor to be used in ODF import where these are
101 distinct attributes.
103 explicit LanguageTag( const OUString& rBcp47, const OUString& rLanguage,
104 std::u16string_view rScript, const OUString& rCountry );
106 /** Init LanguageTag with rtl_Locale.
108 This is a convenience ctor.
110 explicit LanguageTag( const rtl_Locale & rLocale );
112 ~LanguageTag();
114 LanguageTag(LanguageTag const &) = default;
115 LanguageTag(LanguageTag &&) = default;
116 LanguageTag & operator =(LanguageTag const &) = default;
117 LanguageTag & operator =(LanguageTag &&) = default;
119 /** Obtain BCP 47 language tag.
121 @param bResolveSystem
122 If TRUE, resolve an empty language tag denoting the system
123 locale to the real locale used.
124 If FALSE, return an empty OUString for such a tag.
126 const OUString & getBcp47( bool bResolveSystem = true ) const;
128 /** Obtain BCP 47 language tag, but with MS malformed exceptions.
130 To be used *only* in OOXML filter context.
131 For example, es-ES-u-co-trad is stored as es-ES_tradnl which is not a
132 valid BCP 47 language tag.
134 OUString getBcp47MS() const;
136 /** Obtain language tag as Locale.
138 As a convention, language tags that can not be expressed as "pure"
139 css::lang::Locale content using Language and Country fields
140 store "qlt" (ISO 639 reserved for local use) in the Language field and
141 the entire BCP 47 language tag in the Variant field. The Country field
142 contains the corresponding ISO 3166 country code _if_ there is one, or
143 otherwise is empty.
145 @param bResolveSystem
146 If TRUE, resolve an empty language tag denoting the system
147 locale to the real locale used.
148 If FALSE, return an empty Locale for such a tag.
150 const css::lang::Locale & getLocale( bool bResolveSystem = true ) const;
152 /** Obtain mapping to MS-LangID.
154 @param bResolveSystem
155 If TRUE, resolve an empty language tag denoting the system
156 locale to the real locale used.
157 If FALSE, return LANGUAGE_SYSTEM for such a tag.
159 LanguageType getLanguageType( bool bResolveSystem = true ) const;
161 /** Obtain ISO strings for language, script and country.
163 This is a convenience method for ODF export places only. Avoid use in
164 other code.
166 ATTENTION! May return empty strings if the language tag is not
167 expressible in valid ISO codes!
169 @see isIsoODF()
171 Always resolves an empty tag to the system locale.
173 void getIsoLanguageScriptCountry( OUString& rLanguage,
174 OUString& rScript, OUString& rCountry ) const;
176 /** Get ISO 639 language code, or BCP 47 language.
178 Always resolves an empty tag to the system locale.
180 OUString getLanguage() const;
182 /** Get ISO 15924 script code, if not the default script according to
183 BCP 47. For default script an empty string is returned.
185 @see hasScript()
187 Always resolves an empty tag to the system locale.
189 OUString getScript() const;
191 /** Get combined language and script code, separated by '-', if a
192 non-default script is present; for the default script only the language.
194 @see hasScript()
196 Always resolves an empty tag to the system locale.
198 OUString getLanguageAndScript() const;
200 /** Get ISO 3166 country alpha code. Empty if the BCP 47 tags denote a
201 region not expressible as 2 character country code.
203 Always resolves an empty tag to the system locale.
205 OUString getCountry() const;
207 /** Get BCP 47 variant subtags, of the IANA Language Subtag Registry.
209 If there are multiple variant subtags they are separated by '-'.
211 This is NOT related to Locale.Variant!
213 Always resolves an empty tag to the system locale.
215 OUString getVariants() const;
217 /** Get a GLIBC locale string.
219 Always resolves an empty tag to the system locale.
221 @param rEncoding
222 An encoding to be appended to language_country, for example
223 ".UTF-8" including the dot.
225 @return The resulting GLIBC locale string if it could be constructed,
226 if not an empty string is returned.
228 OUString getGlibcLocaleString( std::u16string_view rEncoding ) const;
230 /** If language tag has a non-default script specified.
232 bool hasScript() const;
234 /** If language tag is a locale that can be expressed using only ISO 639
235 language codes and ISO 3166 country codes, thus is convertible to a
236 conforming Locale struct without using extension mechanisms.
238 Note that an empty language tag or empty Locale::Language field or
239 LanguageType LANGUAGE_SYSTEM could be treated as a valid ISO locale in
240 some context, but here is not. If you want that ask for
241 aTag.isSystemLocale() || aTag.isIsoLocale()
243 Always resolves an empty tag to the system locale.
245 bool isIsoLocale() const;
247 /** If language tag is a locale that can be expressed using only ISO 639
248 language codes and ISO 15924 script codes and ISO 3166 country codes,
249 thus can be stored in an ODF document using only fo:language, fo:script
250 and fo:country attributes. If this is FALSE, the locale must be stored
251 as a <*:rfc-language-tag> element.
253 Always resolves an empty tag to the system locale.
255 bool isIsoODF() const;
257 /** If this is a valid BCP 47 language tag.
259 Always resolves an empty tag to the system locale.
261 @see static bool isValidBcp47( const OUString&, OUString*, PrivateUse )
263 bool isValidBcp47() const;
265 /** Whether this tag was constructed as an empty tag denoting the system locale; returns the mbSystemLocale flag as is.
267 bool isSystemLocale() const { return mbSystemLocale;}
269 /** Returns the script type for this language, UNKNOWN if not set */
270 ScriptType getScriptType() const;
272 /** Sets the script type for this language */
273 void setScriptType(ScriptType st);
275 /** Reset with existing BCP 47 language tag string. See ctor. */
276 LanguageTag & reset( const OUString & rBcp47LanguageTag );
278 /** Reset with Locale. */
279 LanguageTag & reset( const css::lang::Locale & rLocale );
281 /** Reset with LanguageType MS-LangID. */
282 LanguageTag & reset( LanguageType nLanguage );
285 /** Fall back to a known locale.
287 If the current tag does not represent a known (by us) locale, fall back
288 to the most likely known locale.
289 If the current tag is known, no change occurs.
291 LanguageTag & makeFallback();
293 /** Return a vector of fall-back strings.
295 In order:
296 full BCP 47 tag, same as getBcp47()
297 lll-Ssss-CC
298 lll-Ssss
299 lll-CC
302 If the tag includes variants the order is:
303 full BCP 47 tag, same as getBcp47()
304 lll-Ssss-CC-vvvvvvvv
305 lll-Ssss-vvvvvvvv
306 lll-Ssss-CC
307 lll-Ssss
308 lll-CC-vvvvvvvv
309 lll-vvvvvvvv
310 lll-CC
313 Only strings that differ from a higher order are included, for example
314 if there is no script the elements will be bcp47, lll-CC, lll; if the
315 bcp47 string is identical to lll-CC then only lll-CC, lll.
317 Note that lll is only ISO 639-1/2 alpha code and CC is only ISO 3166
318 alpha code. If the region can not be expressed as ISO 3166 then no -CC
319 tags are included.
321 @param bIncludeFullBcp47
322 If TRUE, the full BCP 47 tag is included as first element.
323 If FALSE, the full tag is not included; used if the caller
324 obtains the fallbacks only if the full tag did not lead to a
325 match, so subsequent tries need not include it again.
327 ::std::vector< OUString > getFallbackStrings( bool bIncludeFullBcp47 ) const;
330 /** @short Search for an equal or at least for a similar locale in a list
331 of possible ones.
333 @descr First search for a locale that is equal to the reference
334 locale. (means: same BCP47 string)
336 If the reference locale could not be located, check for
337 "similar" locales, in the same order as obtained by
338 getFallbackStrings().
340 If no similar locale could be located, we search for a locale
341 "en-US" inside the given locale list.
343 If "en-US" could not be located, we search for a locale "en"
344 inside the given list.
346 If no "same" nor any "similar" locale could be found, we try
347 "x-default" and "x-no-translate" explicitly. Sometimes
348 variables don't use real localization. For example, in case the
349 localized value is a fixed product name.
351 If no locale matched until then, we use any other locale that
352 exists inside the set of given ones, namely the first
353 encountered!
355 @param rList
356 the vector of possible locales as BCP47 strings.
358 @param rReference
359 the reference locale, BCP47 string.
361 @return An iterator that points to the found element inside the given
362 locale list. If no matching locale could be found it points to
363 the beginning of the list.
365 static ::std::vector< OUString >::const_iterator getFallback( const ::std::vector< OUString > & rList,
366 const OUString & rReference );
369 /** @short Search for an equal or for a similar locale in a list
370 of possible ones where at least the language matches.
372 @descr First search for a locale that is equal to the reference
373 locale.
375 If the reference locale could not be located, check for
376 "similar" locales, in the same order as obtained by
377 getFallbackStrings().
379 If no locale matches, rList.end() is returned.
381 @param rList
382 the vector of possible locales.
384 @param rReference
385 the reference locale.
387 @return An iterator that points to the found element inside the given
388 locale list. If no matching locale could be found it points to
389 the end of the list.
391 static ::std::vector< css::lang::Locale >::const_iterator getMatchingFallback(
392 const ::std::vector< css::lang::Locale > & rList,
393 const css::lang::Locale & rReference );
396 /** Test equality of two LanguageTag, possibly resolving system locale.
398 Resolve empty language tags denoting the system
399 locale to the real locale used before comparing.
401 bool equals( const LanguageTag & rLanguageTag ) const;
403 /** Test equality of two LanguageTag.
405 Does NOT resolve system, i.e. if the system locale is en-US
406 LanguageTag("")==LanguageTag("en-US") returns false! Use
407 equals(...) instead if system locales shall be resolved.
409 bool operator==( const LanguageTag & rLanguageTag ) const;
411 /** Test inequality of two LanguageTag.
413 Does NOT resolve system, i.e. if the system locale is en-US
414 LanguageTag("")!=LanguageTag("en-US") returns true! Use
415 !equals(...) instead if system locales shall be resolved.
417 bool operator!=( const LanguageTag & rLanguageTag ) const;
419 /** Test this LanguageTag less than that LanguageTag.
421 For sorted containers. Does NOT resolve system.
423 bool operator<( const LanguageTag & rLanguageTag ) const;
425 /** Convert MS-LangID to Locale.
427 @param bResolveSystem
428 If TRUE, resolve an empty language tag denoting the system
429 locale to the real locale used.
430 If FALSE, return an empty Locale for such a tag.
432 static css::lang::Locale convertToLocale( LanguageType nLangID, bool bResolveSystem = true );
434 /** Convert Locale to MS-LangID.
436 @param bResolveSystem
437 If TRUE, resolve an empty language tag denoting the system
438 locale to the real locale used.
439 If FALSE, return LANGUAGE_SYSTEM for such a tag.
441 static LanguageType convertToLanguageType( const css::lang::Locale& rLocale, bool bResolveSystem = true );
443 /** Convert MS-LangID to BCP 47 string.
445 Resolve an empty language tag denoting the system
446 locale to the real locale used.
448 static OUString convertToBcp47( LanguageType nLangID );
450 /** Convert Locale to BCP 47 string.
452 @param bResolveSystem
453 If TRUE, resolve an empty language tag denoting the system
454 locale to the real locale used.
455 If FALSE, return an empty OUString for such a tag.
457 static OUString convertToBcp47( const css::lang::Locale& rLocale, bool bResolveSystem = true );
459 /** Convert BCP 47 string to Locale, convenience method.
461 NOTE: exists only for consistency with the other convertTo...()
462 methods, internally uses a temporary LanguageTag instance for
463 conversion so does not save anything compared to
464 LanguageTag(rBcp47).getLocale(bResolveSystem).
466 @param bResolveSystem
467 If TRUE, resolve an empty language tag denoting the system
468 locale to the real locale used.
469 If FALSE, return an empty Locale for such a tag.
471 static css::lang::Locale convertToLocale( const OUString& rBcp47, bool bResolveSystem = true );
473 /** Convert BCP 47 string to MS-LangID, convenience method.
475 NOTE: exists only for consistency with the other convertTo...()
476 methods, internally uses a temporary LanguageTag instance for
477 conversion so does not save anything compared to
478 LanguageTag(rBcp47).getLanguageType().
480 Resolve an empty language tag denoting the system
481 locale to the real locale used.
483 static LanguageType convertToLanguageType( const OUString& rBcp47 );
485 /** Convert BCP 47 string to MS-LangID with fallback, convenience method.
487 NOTE: exists only for consistency with the other convertTo...()
488 methods, internally uses a temporary LanguageTag instance for
489 conversion so does not save anything compared to
490 LanguageTag(rBcp47).makeFallback().getLanguageType().
492 @see makeFallback()
494 Always resolves an empty tag to the system locale.
496 static LanguageType convertToLanguageTypeWithFallback( const OUString& rBcp47 );
498 /** Convert BCP 47 string to Locale with fallback, convenience method.
500 NOTE: exists only for consistency with the other convertTo...()
501 methods, internally uses a temporary LanguageTag instance for
502 conversion so does not save anything compared to
503 LanguageTag(rBcp47).makeFallback().getLocale().
505 @see makeFallback()
507 Always resolves an empty tag to the system locale.
509 static css::lang::Locale convertToLocaleWithFallback( const OUString& rBcp47 );
511 /** Convert Locale to MS-LangID with fallback.
513 Resolves an empty language tag denoting the system
514 locale to LANGUAGE_SYSTEM and does not fall back.
516 static LanguageType convertToLanguageTypeWithFallback( const css::lang::Locale& rLocale );
518 /** Enums to be used with isValidBcp47(). */
519 enum PrivateUse
521 ALLOW = 0, ///< Allow all private-use and local-use including (!) 'qlt' local-use.
522 DISALLOW, ///< Disallow all private-use and 'qlt' local-use, other 'qaa' to 'qtz' local-use are allowed.
523 ALLOW_ART_X ///< Disallow all private-use and 'qlt' local-use, but allow 'art-x-...' private-use
524 /// for artificial constructed languages (and 'art-Latn-x-...' and other scripts).
527 /** If rString represents a valid BCP 47 language tag.
529 Never resolves an empty tag to the system locale, in fact an empty
530 string is invalid here. Does not create an instance to be registered
531 with a conversion to Locale or LanguageType.
533 @param o_pCanonicalized
534 If given and rString is a valid BCP 47 language tag, the
535 canonicalized form is assigned, which may differ from the
536 original string even if that was a valid tag. If rString is not
537 a valid tag, nothing is assigned.
539 @param ePrivateUse
540 If PrivateUse::DISALLOW, valid tags according to BCP 47 but
541 reserved for private use, like 'x-...', are not allowed and
542 FALSE is returned in this case.
544 static bool isValidBcp47( const OUString& rString, OUString* o_pCanonicalized,
545 PrivateUse ePrivateUse = PrivateUse::ALLOW );
547 /** If nLang is a generated on-the-fly LangID */
548 static bool isOnTheFlyID( LanguageType nLang );
549 static ScriptType getOnTheFlyScriptType( LanguageType nLang );
551 /** @ATTENTION: _ONLY_ to be called by the application's configuration! */
552 static void setConfiguredSystemLanguage( LanguageType nLang );
554 /** @ATTENTION: _ONLY_ to be called by fuzzing setup */
555 static void disable_lt_tag_parse();
557 typedef std::shared_ptr< LanguageTagImpl > ImplPtr;
559 private:
561 mutable css::lang::Locale maLocale;
562 mutable OUString maBcp47;
563 mutable LanguageType mnLangID;
564 mutable ImplPtr mpImpl;
565 bool mbSystemLocale : 1;
566 mutable bool mbInitializedBcp47 : 1;
567 mutable bool mbInitializedLocale : 1;
568 mutable bool mbInitializedLangID : 1;
569 bool mbIsFallback : 1;
571 LanguageTagImpl* getImpl();
572 LanguageTagImpl const* getImpl() const;
573 ImplPtr registerImpl() const;
574 void syncFromImpl();
575 void syncVarsFromRawImpl() const;
576 void syncVarsFromImpl() const;
578 void convertLocaleToLang();
579 void convertBcp47ToLocale();
580 void convertBcp47ToLang();
581 void convertLangToLocale();
583 void convertFromRtlLocale();
585 /** Canonicalize if not yet done and synchronize initialized conversions.
587 @return whether BCP 47 language tag string was changed.
589 bool synCanonicalize();
591 void resetVars();
593 static bool isIsoLanguage( const OUString& rLanguage );
594 static bool isIsoScript( const OUString& rScript );
595 static bool isIsoCountry( const OUString& rRegion );
599 #endif // INCLUDED_I18NLANGTAG_LANGUAGETAG_HXX
601 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */