usr/src/man/man3c/u8_textprep_str.3c

   1 '\" te
   2 .\" Copyright (c) 2007, Sun Microsystems Inc. All Rights Reserved.
   3 .\" The contents of this file are subject to the terms of the Common Development and Distribution License (the "License").  You may not use this file except in compliance with the License.
   4 .\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE or http://www.opensolaris.org/os/licensing.  See the License for the specific language governing permissions and limitations under the License.
   5 .\" When distributing Covered Code, include this CDDL HEADER in each file and include the License file at usr/src/OPENSOLARIS.LICENSE.  If applicable, add the following below this CDDL HEADER, with the fields enclosed by brackets "[]" replaced with your own identifying information: Portions Copyright [yyyy] [name of copyright owner]
   6 .TH U8_TEXTPREP_STR 3C "Sep 18, 2007"
   7 .SH NAME
   8 u8_textprep_str \- string-based UTF-8 text preparation function
   9 .SH SYNOPSIS
  10 .LP
  11 .nf
  12 #include <sys/u8_textprep.h>
  13
  14 \fBsize_t\fR \fBu8_textprep_str\fR(\fBchar *\fR\fIinarray\fR, \fBsize_t *\fR\fIinlen\fR,
  15      \fBchar *\fR\fIoutarray\fR, \fBsize_t *\fR\fIoutlen\fR, \fBint\fR \fIflag\fR,
  16      \fBsize_t\fR \fIunicode_version\fR, \fBint *\fR\fIerrnum\fR);
  17 .fi
  18
  19 .SH PARAMETERS
  20 .sp
  21 .ne 2
  22 .na
  23 \fB\fIinarray\fR\fR
  24 .ad
  25 .RS 20n
  26 A pointer to a byte array containing a sequence of UTF-8 character bytes to be
  27 prepared.
  28 .RE
  29
  30 .sp
  31 .ne 2
  32 .na
  33 \fB\fIinlen\fR\fR
  34 .ad
  35 .RS 20n
  36 As input argument, the number of bytes to be prepared in \fIinarray\fR. As
  37 output argument, the number of bytes in \fIinarray\fR still not consumed.
  38 .RE
  39
  40 .sp
  41 .ne 2
  42 .na
  43 \fB\fIoutarray\fR\fR
  44 .ad
  45 .RS 20n
  46 A pointer to a byte array where prepared UTF-8 character bytes can be saved.
  47 .RE
  48
  49 .sp
  50 .ne 2
  51 .na
  52 \fB\fIoutlen\fR\fR
  53 .ad
  54 .RS 20n
  55 As input argument, the number of available bytes at \fIoutarray\fR where
  56 prepared character bytes can be saved.  As output argument, after the
  57 conversion, the number of bytes still available at \fIoutarray\fR.
  58 .RE
  59
  60 .sp
  61 .ne 2
  62 .na
  63 \fB\fIflag\fR\fR
  64 .ad
  65 .RS 20n
  66 The possible preparation options constructed by a bitwise-inclusive-OR of the
  67 following values:
  68 .sp
  69 .ne 2
  70 .na
  71 \fB\fBU8_TEXTPREP_IGNORE_NULL\fR\fR
  72 .ad
  73 .sp .6
  74 .RS 4n
  75 Normally \fBu8_textprep_str()\fR stops the preparation when it encounters a
  76 null byte, even if the value pointed to by \fIinlen\fR is still greater than zero.
  77 .sp
  78 With this option, a null byte does not stop the preparation; the preparation
  79 continues until the number of \fIinarray\fR bytes specified by \fIinlen\fR has
  80 been consumed or an error occurs.
  81 .RE
  82
  83 .sp
  84 .ne 2
  85 .na
  86 \fB\fBU8_TEXTPREP_IGNORE_INVALID\fR\fR
  87 .ad
  88 .sp .6
  89 .RS 4n
  90 Normally \fBu8_textprep_str()\fR stops the preparation if it encounters illegal
  91 or incomplete characters with corresponding \fIerrnum\fR values.
  92 .sp
  93 When this option is set, \fBu8_textprep_str()\fR does not stop the preparation
  94 and instead treats such characters as not needing any preparation.
  95 .RE
  96
  97 .sp
  98 .ne 2
  99 .na
 100 \fB\fBU8_TEXTPREP_TOUPPER\fR\fR
 101 .ad
 102 .sp .6
 103 .RS 4n
 104 Map lowercase characters to uppercase characters if applicable.
 105 .RE
 106
 107 .sp
 108 .ne 2
 109 .na
 110 \fB\fBU8_TEXTPREP_TOLOWER\fR\fR
 111 .ad
 112 .sp .6
 113 .RS 4n
 114 Map uppercase characters to lowercase characters if applicable.
 115 .RE
 116
 117 .sp
 118 .ne 2
 119 .na
 120 \fB\fBU8_TEXTPREP_NFD\fR\fR
 121 .ad
 122 .sp .6
 123 .RS 4n
 124 Apply Unicode Normalization Form D.
 125 .RE
 126
 127 .sp
 128 .ne 2
 129 .na
 130 \fB\fBU8_TEXTPREP_NFC\fR\fR
 131 .ad
 132 .sp .6
 133 .RS 4n
 134 Apply Unicode Normalization Form C.
 135 .RE
 136
 137 .sp
 138 .ne 2
 139 .na
 140 \fB\fBU8_TEXTPREP_NFKD\fR\fR
 141 .ad
 142 .sp .6
 143 .RS 4n
 144 Apply Unicode Normalization Form KD.
 145 .RE
 146
 147 .sp
 148 .ne 2
 149 .na
 150 \fB\fBU8_TEXTPREP_NFKC\fR\fR
 151 .ad
 152 .sp .6
 153 .RS 4n
 154 Apply Unicode Normalization Form KC.
 155 .RE
 156
 157 Only one case folding option is allowed. Only one Unicode Normalization option
 158 is allowed.
 159 .sp
 160 When a case folding option and a Unicode Normalization option are specified
 161 together, UTF-8 text preparation is done by doing case folding first and then
 162 Unicode Normalization.
 163 .sp
 164 If no option is specified, no processing occurs except the simple copying of
 165 bytes from input to output.
 166 .RE
 167
 168 .sp
 169 .ne 2
 170 .na
 171 \fB\fIunicode_version\fR\fR
 172 .ad
 173 .RS 20n
 174 The version of Unicode data that should be used during UTF-8 text preparation.
 175 The following values are supported:
 176 .sp
 177 .ne 2
 178 .na
 179 \fB\fBU8_UNICODE_320\fR\fR
 180 .ad
 181 .sp .6
 182 .RS 4n
 183 Use Unicode 3.2.0 data during text preparation.
 184 .RE
 185
 186 .sp
 187 .ne 2
 188 .na
 189 \fB\fBU8_UNICODE_500\fR\fR
 190 .ad
 191 .sp .6
 192 .RS 4n
 193 Use Unicode 5.0.0 data during text preparation.
 194 .RE
 195
 196 .sp
 197 .ne 2
 198 .na
 199 \fB\fBU8_UNICODE_LATEST\fR\fR
 200 .ad
 201 .sp .6
 202 .RS 4n
 203 Use the latest available Unicode version data, which is currently Unicode 5.0.0.
 204 .RE
 205
 206 .RE
 207
 208 .sp
 209 .ne 2
 210 .na
 211 \fB\fIerrnum\fR\fR
 212 .ad
 213 .RS 20n
 214 The error value when preparation is not completed or fails. The following
 215 values are supported:
 216 .sp
 217 .ne 2
 218 .na
 219 \fB\fBE2BIG\fR\fR
 220 .ad
 221 .RS 10n
 222 Text preparation stopped due to lack of space in the output array.
 223 .RE
 224
 225 .sp
 226 .ne 2
 227 .na
 228 \fB\fBEBADF\fR\fR
 229 .ad
 230 .RS 10n
 231 Specified option values are conflicting and cannot be supported.
 232 .RE
 233
 234 .sp
 235 .ne 2
 236 .na
 237 \fB\fBEILSEQ\fR\fR
 238 .ad
 239 .RS 10n
 240 Text preparation stopped due to an input byte that does not belong to UTF-8.
 241 .RE
 242
 243 .sp
 244 .ne 2
 245 .na
 246 \fB\fBEINVAL\fR\fR
 247 .ad
 248 .RS 10n
 249 Text preparation stopped due to an incomplete UTF-8 character at the end of the
 250 input array.
 251 .RE
 252
 253 .sp
 254 .ne 2
 255 .na
 256 \fB\fBERANGE\fR\fR
 257 .ad
 258 .RS 10n
 259 The specified Unicode version value is not a supported version.
 260 .RE
 261
 262 .RE
 263
 264 .SH DESCRIPTION
 265 .sp
 266 .LP
 267 The \fBu8_textprep_str()\fR function prepares the sequence of UTF-8 characters
 268 in the array specified by \fIinarray\fR into a sequence of corresponding UTF-8
 269 characters prepared in the array specified by \fIoutarray\fR. The \fIinarray\fR
 270 argument points to the first character of the input byte array,
 271 and \fIinlen\fR indicates the number of bytes to the end of the array to
 272 be prepared. The \fIoutarray\fR argument points to
 273 the first available byte of the output byte array, and \fIoutlen\fR indicates the
 274 number of the available bytes to the end of the array. Unless \fIflag\fR
 275 includes \fBU8_TEXTPREP_IGNORE_NULL\fR, \fBu8_textprep_str()\fR stops when it
 276 encounters a null byte in the input array regardless of the current
 277 \fIinlen\fR value.
 278 .sp
 279 .LP
 280 Unless \fIflag\fR includes \fBU8_TEXTPREP_IGNORE_INVALID\fR, if a sequence of
 281 input bytes does not form a valid UTF-8 character, preparation stops after the
 282 previous successfully prepared character. Likewise, unless \fIflag\fR includes
 283 \fBU8_TEXTPREP_IGNORE_INVALID\fR, if the input array ends with an incomplete
 284 UTF-8 character, preparation stops after the previous successfully prepared
 285 bytes. If the output array is not large enough to hold the entire prepared
 286 text, preparation stops just prior to the input bytes that would cause the
 287 output array to overflow. The value pointed to by \fIinlen\fR is decremented to
 288 reflect the number of bytes still not prepared in the input array. The value
 289 pointed to by \fIoutlen\fR is decremented to reflect the number of bytes still
 290 available in the output array.
 291 .SH RETURN VALUES
 292 .sp
 293 .LP
 294 The \fBu8_textprep_str()\fR function updates the values pointed to by
 295 \fIinlen\fR and \fIoutlen\fR arguments to reflect the extent of the
 296 preparation. When \fBU8_TEXTPREP_IGNORE_INVALID\fR is specified,
 297 \fBu8_textprep_str()\fR returns the number of illegal or incomplete characters
 298 found during the text preparation. When \fBU8_TEXTPREP_IGNORE_INVALID\fR is not
 299 specified and the text preparation is entirely successful, the function returns
 300 0. If the entire string in the input array is prepared, the value pointed to by
 301 \fIinlen\fR will be 0. If the text preparation is stopped due to any conditions
 302 mentioned above, the value pointed to by \fIinlen\fR will be non-zero and
 303 \fIerrnum\fR is set to indicate the error. If such an error or any other error
 304 occurs, \fBu8_textprep_str()\fR returns (\fBsize_t\fR)-1 and sets \fIerrnum\fR
 305 to indicate the error.
 306 .SH EXAMPLES
 307 .LP
 308 \fBExample 1 \fRSimple UTF-8 text preparation
 309 .sp
 310 .in +2
 311 .nf
 312 #include <sys/u8_textprep.h>
 313 \&.
 314 \&.
 315 \&.
 316 size_t ret;
 317 char ib[MAXPATHLEN];
 318 char ob[MAXPATHLEN];
 319 size_t il, ol;
 320 int err;
 321 \&.
 322 \&.
 323 \&.
 324 /*
 325  * We got a UTF-8 pathname from somewhere.
 326  *
 327  * Calculate the length of input string including the terminating
 328  * NULL byte and prepare other arguments.
 329  */
 330 (void) strlcpy(ib, pathname, MAXPATHLEN);
 331 il = strlen(ib) + 1;
 332 ol = MAXPATHLEN;
 333
 334 /*
 335  * Do toupper case folding, apply Unicode Normalization Form D,
 336  * ignore NULL byte, and ignore any illegal/incomplete characters.
 337  */
 338 ret = u8_textprep_str(ib, &il, ob, &ol,
 339     (U8_TEXTPREP_IGNORE_NULL|U8_TEXTPREP_IGNORE_INVALID|
 340     U8_TEXTPREP_TOUPPER|U8_TEXTPREP_NFD), U8_UNICODE_LATEST, &err);
 341 if (ret == (size_t)-1) {
 342     if (err == E2BIG)
 343         return (-1);
 344     if (err == EBADF)
 345         return (-2);
 346     if (err == ERANGE)
 347         return (-3);
 348     return (-4);
 349 }
 350 .fi
 351 .in -2
 352
 353 .SH ATTRIBUTES
 354 .sp
 355 .LP
 356 See \fBattributes\fR(5) for descriptions of the following attributes:
 357 .sp
 358
 359 .sp
 360 .TS
 361 box;
 362 c | c
 363 l | l .
 364 ATTRIBUTE TYPE  ATTRIBUTE VALUE
 365 _
 366 Interface Stability     Committed
 367 _
 368 MT-Level        MT-Safe
 369 .TE
 370
 371 .SH SEE ALSO
 372 .sp
 373 .LP
 374 \fBu8_strcmp\fR(3C), \fBu8_validate\fR(3C), \fBattributes\fR(5),
 375 \fBu8_strcmp\fR(9F), \fBu8_textprep_str\fR(9F), \fBu8_validate\fR(9F)
 376 .sp
 377 .LP
 378 The Unicode Standard (http://www.unicode.org)
 379 .SH NOTES
 380 .sp
 381 .LP
 382 After the text preparation, the number of prepared UTF-8 characters and the
 383 total number of bytes may decrease or increase when you compare the numbers with
 384 the input buffer.
 385 .sp
 386 .LP
 387 Case conversions are performed using Unicode data of the corresponding version.
 388 There are no locale-specific case conversions that can be performed.