usr/src/cmd/sort/common/fields.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21
  22 /*
  23  * Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved.
  24  */
  25
  26 #include "fields.h"
  27
  28 /*
  29  * fields
  30  *
  31  * Overview
  32  *   By a field, we mean the various delimited character sequences within each
  33  *   line of the input files.  The sort key consists of an ordered sequence of
  34  *   fields, which need not include all possible fields for the given line.
  35  *   (Furthermore, not every line need contain sufficient fields for the fields
  36  *   given within the sort key.  In fact, none of the lines in the input stream
  37  *   need contain sufficient fields.)
  38  *
  39  *   There are two methods for specifying fields for sort(1); these are
  40  *   discussed in options.c.  Here we discuss only the internal representation
  41  *   of fields, as used for constructing the collation vector for each line as
  42  *   defined by the sort key.
  43  *
  44  * Representation
  45  *   The sort key is a singly-linked list of field specifiers.  At present,
  46  *   fields may belong to one of three species:  alphabetical, numerical, or
  47  *   monthly; the species (f_species) then indicates the conversion function
  48  *   (f_convert) used to transform the raw characters of the character sequence
  49  *   to a collatable form.  (In principle, this allows us to consider future
  50  *   field species such as hexadecimal.)
  51  *
  52  *   Fields and offsets are numbered such that zero refers to the first field or
  53  *   character, respectively.  Thus, the interpretation of a key specifier, m.n,
  54  *   is that the field begins at the nth character beyond the mth occurence of
  55  *   the key separator.  If the blanks flag has been specified, then the field
  56  *   begins at the nth non-blank character past the mth key separator.  If the
  57  *   key separator is unspecified, then the key separator is defined as one or
  58  *   more blank characters.
  59  *
  60  *   In general, the various options afforded by sort may be broken into two
  61  *   categories:  field species and field modifiers.  For each field species,
  62  *   there is one or more conversion routines that take a delimited character
  63  *   sequence and convert it to a character sequence collatable by strcmp() or
  64  *   memcmp().  For field species that may be further modified, such as the
  65  *   fold-to-uppercase option for alphabetic fields, the conversion routine may
  66  *   be aware of how the modifier affects collation.  Finally, the no-modifiers
  67  *   case may present an opportunity for a simplified, faster version.
  68  *
  69  * Code Structure
  70  *   The code paths for single-byte and multi-byte locales diverge significantly
  71  *   in fields.c.  Most routines have an *_wide() version, which produces an
  72  *   equivalent effect for line records whose data field is composed of wide
  73  *   characters (wchar_t).  However, the l_collated field of a line record is
  74  *   always composed of characters, so that the radix sorts provided in
  75  *   internal.c can work in both single- and multi-byte locales.  Thus, in the
  76  *   various convert_*_wide() routines, the output is placed in l_collated, with
  77  *   a length multiplier of 4.
  78  */
  79
  80 #define BEFORE_NUMBER   0x0
  81 #define IN_NUMBER       0x1
  82
  83 static char     numerical_separator;
  84 static char     numerical_decimal;
  85 static char     monetary_separator;
  86 static char     monetary_decimal;
  87
  88 static wchar_t  w_numerical_separator;
  89 static wchar_t  w_numerical_decimal;
  90 static wchar_t  w_monetary_separator;
  91 static wchar_t  w_monetary_decimal;
  92
  93 #define MONTHS_IN_YEAR  12
  94 #define MAX_MON_LEN     20
  95
  96 enum { MO_NONE = 1, MO_OFFSET = 2 };
  97
  98 static char     *months[MONTHS_IN_YEAR];
  99 static size_t   month_lengths[MONTHS_IN_YEAR];
 100 static wchar_t  *w_months[MONTHS_IN_YEAR];
 101 static size_t   w_month_lengths[MONTHS_IN_YEAR];
 102
 103 #define DECIMAL_CHAR            (numerical_decimal)
 104 #define IS_BLANK(x)             (isspace((uchar_t)(x)) && (x) != '\n')
 105 #define IS_SEPARATOR(x)         \
 106         ((numerical_separator != '\0' && (x) == numerical_separator) || \
 107         (monetary_separator != '\0' && (x) == monetary_separator))
 108 #define IS_DECIMAL(x)           \
 109         ((x) == numerical_decimal || \
 110         (monetary_decimal != '\0' && (x) == monetary_decimal))
 111 #define W_DECIMAL_CHAR          (w_numerical_decimal)
 112 #define W_IS_BLANK(x)           (iswspace(x) && (x) != L'\n')
 113 #define W_IS_SEPARATOR(x)       \
 114         ((numerical_separator != '\0' && (x) == w_numerical_separator) || \
 115         (monetary_separator != '\0' && (x) == w_monetary_separator))
 116 #define W_IS_DECIMAL(x)         \
 117         (((x) == w_numerical_decimal) || \
 118         (monetary_decimal != '\0' && (x) == w_monetary_decimal))
 119
 120 #define INTERFIELD_SEPARATOR '\0'
 121 #define W_INTERFIELD_SEPARATOR L'\0'
 122
 123 #define INT_SIGN_FLIP_MASK 0x80000000
 124 #define INT_SIGN_PASS_MASK 0x00000000
 125
 126 /*
 127  * strx_ops_t, xfrm_len, and xfrm_cpy:  In the case where we are sorting in the
 128  * C locale, we want to avoid the expense of transforming strings to collatable
 129  * forms since, by definition, an arbitrary string in the C locale is already in
 130  * its collatable form.  Therefore, we construct a small ops vector (the
 131  * strx_ops) and two wrappers: xfrm_len() to massage the strxfrm(NULL, ...) into
 132  * strlen()-like behaviour, and xfrm_cpy() to make strncpy() appear
 133  * strxfrm()-like.
 134  */
 135 /*ARGSUSED*/
 136 static size_t
 137 xfrm_len(const char *s2, size_t len)
 138 {
 139         return (strxfrm(NULL, s2, 0) + 1);
 140 }
 141
 142 /*
 143  * The length represented by n includes a null character, so to return the
 144  * correct length we subtract 1.  Note that this function is only used by
 145  * field_convert_alpha, and isn't for general use, as it assumes that n is the
 146  * length of s2 plus a null character.
 147  */
 148 static size_t
 149 C_ncpy(char *s1, const char *s2, size_t n)
 150 {
 151         (void) strncpy(s1, s2, n);
 152         return (n - 1);
 153 }
 154
 155 /*ARGSUSED*/
 156 static size_t
 157 C_len(const char *s, size_t len)
 158 {
 159         ASSERT(s != NULL);
 160         return (len);
 161 }
 162
 163 typedef struct _strx_ops {
 164         size_t  (*sx_len)(const char *, size_t);
 165         size_t  (*sx_xfrm)(char *, const char *, size_t);
 166 } strx_ops_t;
 167
 168 static const strx_ops_t C_ops = { C_len, C_ncpy };
 169 static const strx_ops_t SB_ops = { xfrm_len, strxfrm };
 170
 171 static const strx_ops_t *xfrm_ops;
 172
 173 static void
 174 field_initialize_separator(void)
 175 {
 176         /*
 177          * A locale need not define all of the cases below:  only decimal_point
 178          * must be defined.  Furthermore, sort(1) has traditionally not used the
 179          * positive_sign and negative_sign, grouping, or currency_symbols (or
 180          * their numeric counterparts, if any).
 181          */
 182         struct lconv *conv = localeconv();
 183
 184         if (!xstreql(conv->thousands_sep, "")) {
 185                 numerical_separator = *conv->thousands_sep;
 186                 (void) mbtowc(&w_numerical_separator, conv->thousands_sep,
 187                     MB_CUR_MAX);
 188         } else
 189                 numerical_separator = '\0';
 190
 191         if (!xstreql(conv->mon_thousands_sep, "")) {
 192                 monetary_separator = *conv->mon_thousands_sep;
 193                 (void) mbtowc(&w_monetary_separator, conv->mon_thousands_sep,
 194                     MB_CUR_MAX);
 195         } else
 196                 monetary_separator = '\0';
 197
 198         if (!xstreql(conv->mon_decimal_point, "")) {
 199                 monetary_decimal = *conv->mon_decimal_point;
 200                 (void) mbtowc(&w_monetary_decimal, conv->mon_decimal_point,
 201                     MB_CUR_MAX);
 202         } else
 203                 monetary_decimal = '\0';
 204
 205         numerical_decimal = *conv->decimal_point;
 206         (void) mbtowc(&w_numerical_decimal, conv->decimal_point, MB_CUR_MAX);
 207 }
 208
 209 static void
 210 field_initialize_month(int is_c_locale)
 211 {
 212         int i;
 213         int j;
 214         struct tm this_month;
 215         const char *c_months[MONTHS_IN_YEAR] = {
 216                 "JAN", "FEB", "MAR", "APR", "MAY", "JUN",
 217                 "JUL", "AUG", "SEP", "OCT", "NOV", "DEC"
 218         };
 219
 220         char month_name[MAX_MON_LEN * MB_LEN_MAX];
 221         wchar_t w_month_name[MAX_MON_LEN];
 222
 223         if (is_c_locale) {
 224                 for (i = 0; i < MONTHS_IN_YEAR; i++) {
 225                         months[i] = (char *)c_months[i];
 226                         month_lengths[i] = strlen(c_months[i]);
 227                 }
 228                 /*
 229                  * We don't need to initialize the wide version of the month
 230                  * names.
 231                  */
 232                 return;
 233         }
 234
 235         (void) memset(&this_month, 0, sizeof (this_month));
 236
 237         for (i = 0; i < MONTHS_IN_YEAR; i++) {
 238                 this_month.tm_mon = i;
 239
 240                 (void) strftime(month_name, sizeof (month_name),
 241                     "%b", &this_month);
 242
 243                 for (j = 0; j < strlen(month_name); j++)
 244                         month_name[j] = toupper(month_name[j]);
 245                 (void) mbstowcs(w_month_name, month_name, MAX_MON_LEN);
 246
 247                 months[i] = strdup(month_name);
 248                 month_lengths[i] = strlen(month_name);
 249                 w_months[i] = wsdup(w_month_name);
 250                 w_month_lengths[i] = wslen(w_month_name);
 251         }
 252 }
 253
 254 void
 255 field_initialize(sort_t *S)
 256 {
 257         field_initialize_month(S->m_c_locale);
 258         field_initialize_separator();
 259
 260         if (S->m_c_locale)
 261                 xfrm_ops = &C_ops;
 262         else
 263                 xfrm_ops = &SB_ops;
 264 }
 265
 266 field_t *
 267 field_new(sort_t *S)
 268 {
 269         field_t *F = safe_realloc(NULL, sizeof (field_t));
 270
 271         F->f_start_field = -1;
 272         F->f_start_offset = -1;
 273         F->f_end_field = -1;
 274         F->f_end_offset = -1;
 275         F->f_next = NULL;
 276
 277         if (S == NULL) {
 278                 F->f_species = ALPHA;
 279                 F->f_options = 0;
 280         } else {
 281                 F->f_species = S->m_default_species;
 282                 F->f_options = S->m_field_options;
 283         }
 284
 285         return (F);
 286 }
 287
 288 void
 289 field_delete(field_t *F)
 290 {
 291         free(F);
 292 }
 293
 294 /*
 295  * The recursive implementation of field_add_to_chain() given below is
 296  * inappropriate if function calls are expensive, or a truly large number of
 297  * fields are anticipated.
 298  */
 299 void
 300 field_add_to_chain(field_t **F, field_t *A)
 301 {
 302         if (*F == NULL)
 303                 *F = A;
 304         else
 305                 field_add_to_chain(&((*F)->f_next), A);
 306 }
 307
 308 #ifdef DEBUG
 309 #ifndef _LP64
 310 #define FIELD_FMT \
 311 "\nStart field: %d\tStart offset: %d\nEnd field: %d\tEnd offset: %d\n"
 312 #else /* !_LP64 */
 313 #define FIELD_FMT \
 314 "\nStart field: %ld\tStart offset: %ld\nEnd field: %ld\tEnd offset: %ld\n"
 315 #endif /* !_LP64 */
 316
 317 /*
 318  * field_print is used only for debugging purposes.
 319  */
 320 void
 321 field_print(field_t *F)
 322 {
 323         char *field_names[] = {"ALPHA", "MONTH", "NUMERIC"};
 324         int status = 0;
 325
 326         (void) fprintf(stderr, "Type: %s", field_names[F->f_species]);
 327         (void) fprintf(stderr, "\tOptions: ");
 328
 329         if (F->f_options & FIELD_REVERSE_COMPARISONS) {
 330                 (void) fprintf(stderr, "REVERSE");
 331                 status++;
 332         }
 333         if (F->f_options & FIELD_DICTIONARY_ORDER) {
 334                 (void) fprintf(stderr, "DICTIONARY ");
 335                 status++;
 336         }
 337         if (F->f_options & FIELD_FOLD_UPPERCASE) {
 338                 (void) fprintf(stderr, "UPPERCASE ");
 339                 status++;
 340         }
 341         if (F->f_options & FIELD_IGNORE_NONPRINTABLES) {
 342                 (void) fprintf(stderr, "PRINTABLES ");
 343                 status++;
 344         }
 345         if (F->f_options & FIELD_IGNORE_BLANKS_START) {
 346                 (void) fprintf(stderr, "BLANKS_START ");
 347                 status++;
 348         }
 349         if (F->f_options & FIELD_IGNORE_BLANKS_END) {
 350                 (void) fprintf(stderr, "BLANKS_END ");
 351                 status++;
 352         }
 353
 354         if (status == 0)
 355                 (void) fprintf(stderr, "NO_MODIFIERS");
 356
 357         (void) fprintf(stderr, FIELD_FMT, F->f_start_field, F->f_start_offset,
 358             F->f_end_field, F->f_end_offset);
 359 }
 360 #endif /* DEBUG */
 361
 362 static ssize_t
 363 field_boundary(field_t *F, line_rec_t *L, int is_end, int is_blanks)
 364 {
 365         char *S = L->l_data.sp;
 366         char *T = S;
 367         char *eol = S + L->l_data_length;
 368         ssize_t field = is_end ? F->f_end_field : F->f_start_field;
 369         ssize_t offset = is_end ? F->f_end_offset : F->f_start_offset;
 370         ssize_t ret;
 371
 372         ASSERT(is_end || field > -1);
 373
 374         if (is_end && field == -1)
 375                 return (L->l_data_length);
 376
 377         while (field-- > 0) {
 378                 while (T < eol && IS_BLANK(*T))
 379                         T++;
 380
 381                 while (T < eol && !IS_BLANK(*T))
 382                         T++;
 383         }
 384
 385         if ((!is_end || offset > 0) && is_blanks) {
 386                 while (IS_BLANK(*T))
 387                         T++;
 388         }
 389
 390         if ((ret = MAX(T - S, 0) + offset) >= L->l_data_length)
 391                 return (L->l_data_length);
 392
 393         return (ret);
 394 }
 395
 396 static void
 397 field_delimit(field_t *F, line_rec_t *L, ssize_t *start, ssize_t *end)
 398 {
 399         ASSERT(F->f_start_field > -1);
 400
 401         *start = field_boundary(F, L, 0,
 402             F->f_options & FIELD_IGNORE_BLANKS_START);
 403         *end = field_boundary(F, L, 1,
 404             F->f_options & FIELD_IGNORE_BLANKS_END);
 405 }
 406
 407 static ssize_t
 408 field_boundary_wide(field_t *F, line_rec_t *L, int is_end, int is_blanks)
 409 {
 410         wchar_t *S = L->l_data.wp;
 411         wchar_t *T = S;
 412         wchar_t *eol = S + L->l_data_length;
 413         ssize_t field = is_end ? F->f_end_field : F->f_start_field;
 414         ssize_t offset = is_end ? F->f_end_offset : F->f_start_offset;
 415         ssize_t ret;
 416
 417         ASSERT(is_end || field > -1);
 418
 419         if (is_end && field == -1)
 420                 return (L->l_data_length);
 421
 422         while (field-- > 0) {
 423                 while (T < eol && W_IS_BLANK(*T))
 424                         T++;
 425
 426                 while (T < eol && !W_IS_BLANK(*T))
 427                         T++;
 428         }
 429
 430         if ((!is_end || offset > 0) && is_blanks) {
 431                 while (W_IS_BLANK(*T))
 432                         T++;
 433         }
 434
 435         if ((ret = MAX(T - S, 0) + offset) >= L->l_data_length)
 436                 return (L->l_data_length);
 437
 438         return (ret);
 439 }
 440
 441 static void
 442 field_delimit_wide(field_t *F, line_rec_t *L, ssize_t *start, ssize_t *end)
 443 {
 444         ASSERT(F->f_start_field > -1);
 445
 446         *start = field_boundary_wide(F, L, 0,
 447             F->f_options & FIELD_IGNORE_BLANKS_START);
 448         *end = field_boundary_wide(F, L, 1,
 449             F->f_options & FIELD_IGNORE_BLANKS_END);
 450 }
 451
 452 static ssize_t
 453 field_boundary_tabbed(field_t *F, line_rec_t *L, int is_end, int is_blanks,
 454     vchar_t delimiter)
 455 {
 456         char *S = L->l_data.sp;
 457         char *T = S;
 458         char *eol = S + L->l_data_length;
 459         ssize_t field = is_end ? F->f_end_field : F->f_start_field;
 460         ssize_t offset = is_end ? F->f_end_offset : F->f_start_offset;
 461         ssize_t ret;
 462
 463         ASSERT(is_end || field > -1);
 464
 465         if (is_end && field == -1)
 466                 return (L->l_data_length);
 467
 468         while (field-- > 0) {
 469                 T = xstrnchr(T, delimiter.sc, eol - T);
 470                 if (T == NULL || T > eol)
 471                         return (L->l_data_length);
 472
 473                 T++;
 474         }
 475
 476         if ((!is_end || offset != 0) && is_blanks) {
 477                 while (IS_BLANK(*T))
 478                         T++;
 479         }
 480
 481         if ((ret = MAX(T - S, 0) + offset) >= L->l_data_length) {
 482                 if (L->l_data_length <= 0)
 483                         return (0);
 484                 if (S[L->l_data_length - 1] == delimiter.sc) {
 485                         return (L->l_data_length - 1);
 486                 } else {
 487                         return (L->l_data_length);
 488                 }
 489         }
 490
 491         if (is_end && offset == 0)
 492                 ret--;
 493
 494         return (ret);
 495 }
 496
 497 /*
 498  * field_delimit_tabbed() is called when a field separator has been defined
 499  * using the -t option.  The character at the offset, start, is either one or
 500  * more character positions past the delimiter marking the start of the
 501  * field, or at the end of the line.
 502  */
 503 static void
 504 field_delimit_tabbed(field_t *F, line_rec_t *L, ssize_t *start, ssize_t *end,
 505     vchar_t delimiter)
 506 {
 507         ASSERT(F->f_start_field > -1);
 508
 509         *start = field_boundary_tabbed(F, L, 0, F->f_options &
 510             FIELD_IGNORE_BLANKS_START, delimiter);
 511         *end = field_boundary_tabbed(F, L, 1, F->f_options &
 512             FIELD_IGNORE_BLANKS_END, delimiter);
 513 }
 514
 515 static ssize_t
 516 field_boundary_tabbed_wide(field_t *F, line_rec_t *L, int is_end, int is_blanks,
 517     vchar_t delimiter)
 518 {
 519         wchar_t *S = L->l_data.wp;
 520         wchar_t *T = S;
 521         wchar_t *eol = S + L->l_data_length;
 522         ssize_t field = is_end ? F->f_end_field : F->f_start_field;
 523         ssize_t offset = is_end ? F->f_end_offset : F->f_start_offset;
 524         ssize_t ret;
 525
 526         ASSERT(is_end || field > -1);
 527
 528         if (is_end && field == -1)
 529                 return (L->l_data_length);
 530
 531         while (field-- > 0) {
 532                 T = xwsnchr(T, delimiter.wc, eol - T);
 533                 if (T == NULL || T > eol)
 534                         return (L->l_data_length);
 535
 536                 T++;
 537         }
 538
 539         if ((!is_end || offset != 0) && is_blanks) {
 540                 while (W_IS_BLANK(*T))
 541                         T++;
 542         }
 543
 544         if ((ret = MAX(T - S, 0) + offset) >= L->l_data_length) {
 545                 if (L->l_data_length <= 0)
 546                         return (0);
 547                 if (S[L->l_data_length - 1] == delimiter.wc) {
 548                         return (L->l_data_length - 1);
 549                 } else {
 550                         return (L->l_data_length);
 551                 }
 552         }
 553
 554         if (is_end && offset == 0)
 555                 ret--;
 556
 557         return (ret);
 558 }
 559
 560 static void
 561 field_delimit_tabbed_wide(field_t *F, line_rec_t *L, ssize_t *start,
 562     ssize_t *end, vchar_t delimiter)
 563 {
 564         ASSERT(F->f_start_field > -1);
 565
 566         *start = field_boundary_tabbed_wide(F, L, 0, F->f_options &
 567             FIELD_IGNORE_BLANKS_START, delimiter);
 568         *end = field_boundary_tabbed_wide(F, L, 1, F->f_options &
 569             FIELD_IGNORE_BLANKS_END, delimiter);
 570 }
 571
 572 /*ARGSUSED*/
 573 ssize_t
 574 field_convert_month(field_t *F, line_rec_t *L, vchar_t delimiter,
 575     ssize_t data_offset, ssize_t data_length, ssize_t coll_offset)
 576 {
 577         int j;
 578         ssize_t val;
 579         char month_candidate[MAX_MON_LEN * MB_LEN_MAX];
 580         ssize_t month_length = data_length;
 581         ssize_t month_offset = data_offset;
 582
 583         if (sizeof (char) > L->l_collate_bufsize - coll_offset)
 584                 return (-1);
 585
 586         (void) memset(month_candidate, 0, MAX_MON_LEN * MB_LEN_MAX);
 587
 588
 589         /*
 590          * The month field formally begins with the first non-blank character.
 591          */
 592         while (IS_BLANK(*(L->l_data.sp + month_offset))) {
 593                 month_offset++;
 594                 month_length--;
 595         }
 596
 597         for (j = 0; j < MAX_MON_LEN && j < month_length; j++)
 598                 month_candidate[j] = toupper((L->l_data.sp + month_offset)[j]);
 599
 600         for (j = 0; j < MONTHS_IN_YEAR; j++) {
 601                 if (xstrneql(month_candidate, months[j], month_lengths[j])) {
 602                         *(L->l_collate.sp + coll_offset) = '\0' + j + MO_OFFSET;
 603                         return (1);
 604                 }
 605         }
 606
 607         /*
 608          * no matching month; copy string into field.  required behaviour is
 609          * that "month-free" keys sort before month-sortable keys, so insert
 610          * a "will sort first" token.
 611          */
 612         *(L->l_collate.sp + coll_offset) = '\0' + MO_NONE;
 613
 614         val = field_convert_alpha_simple(F, L, delimiter, data_offset,
 615             data_length, coll_offset + 1);
 616
 617         if (val < 0)
 618                 return (-1);
 619         else
 620                 return (val + 1);
 621 }
 622
 623 /*ARGSUSED*/
 624 ssize_t
 625 field_convert_month_wide(field_t *F, line_rec_t *L, vchar_t delimiter,
 626     ssize_t data_offset, ssize_t data_length, ssize_t coll_offset)
 627 {
 628         ssize_t j;
 629         ssize_t val;
 630         wchar_t month_candidate[MAX_MON_LEN];
 631         wchar_t *month;
 632         wchar_t *buffer = L->l_collate.wp + coll_offset;
 633         ssize_t month_length = data_length;
 634         ssize_t month_offset = data_offset;
 635
 636         if (L->l_collate_bufsize - coll_offset * sizeof (wchar_t) <
 637             sizeof (wchar_t))
 638                 return (-1);
 639
 640         (void) memset(month_candidate, 0, MAX_MON_LEN * sizeof (wchar_t));
 641
 642
 643         while (W_IS_BLANK(*(L->l_data.wp + month_offset))) {
 644                 month_offset++;
 645                 month_length--;
 646         }
 647
 648         month = L->l_data.wp + month_offset;
 649
 650         for (j = 0; j < MAX_MON_LEN && j < month_length; j++)
 651                 month_candidate[j] = towupper(month[j]);
 652
 653         for (j = 0; j < MONTHS_IN_YEAR; j++)
 654                 if (xwcsneql(month_candidate, w_months[j],
 655                     w_month_lengths[j])) {
 656                         *buffer = L'\0' + j + MO_OFFSET;
 657                         return (1);
 658                 }
 659
 660         *buffer = L'\0' + MO_NONE;
 661
 662         val = field_convert_alpha_wide(F, L, delimiter, data_offset,
 663             data_length, coll_offset + sizeof (wchar_t));
 664
 665         if (val < 0)
 666                 return (-1);
 667         else
 668                 return (val + 1);
 669 }
 670
 671 /*
 672  * field_convert_alpha() always fails with return value -1 if the converted
 673  * string would cause l_collate_length to exceed l_collate_bufsize
 674  */
 675 /*ARGSUSED*/
 676 ssize_t
 677 field_convert_alpha(field_t *F, line_rec_t *L, vchar_t delimiter,
 678     ssize_t data_offset, ssize_t data_length, ssize_t coll_offset)
 679 {
 680         static char *compose;
 681         static ssize_t compose_length;
 682
 683         ssize_t clength = 0;
 684         ssize_t dlength;
 685         ssize_t i;
 686
 687         if (compose_length < (data_length + 1)) {
 688                 compose_length = data_length + 1;
 689                 compose = safe_realloc(compose, compose_length * sizeof (char));
 690         }
 691
 692         for (i = data_offset; i < data_offset + data_length; i++) {
 693                 char t = (L->l_data.sp)[i];
 694
 695                 if ((F->f_options & FIELD_IGNORE_NONPRINTABLES) &&
 696                     !isprint((uchar_t)t))
 697                         continue;
 698
 699                 if ((F->f_options & FIELD_DICTIONARY_ORDER) &&
 700                     !isalnum((uchar_t)t) && !isspace((uchar_t)t))
 701                         continue;
 702
 703                 if (F->f_options & FIELD_FOLD_UPPERCASE)
 704                         t = toupper(t);
 705
 706                 compose[clength++] = t;
 707         }
 708         compose[clength] = '\0';
 709
 710         if ((dlength = xfrm_ops->sx_len(compose, clength)) <
 711             L->l_collate_bufsize - coll_offset)
 712                 return (xfrm_ops->sx_xfrm(L->l_collate.sp + coll_offset,
 713                     compose, dlength + 1));
 714         else
 715                 return ((ssize_t)-1);
 716 }
 717
 718 /*ARGSUSED*/
 719 ssize_t
 720 field_convert_alpha_simple(field_t *F, line_rec_t *L, vchar_t delimiter,
 721     ssize_t data_offset, ssize_t data_length, ssize_t coll_offset)
 722 {
 723         static char *compose;
 724         static ssize_t compose_length;
 725
 726         ssize_t clength;
 727         ssize_t dlength;
 728
 729         if (compose_length < (data_length + 1)) {
 730                 compose_length = data_length + 1;
 731                 compose = safe_realloc(compose, compose_length * sizeof (char));
 732         }
 733
 734         (void) memcpy(compose, L->l_data.sp + data_offset, data_length);
 735         clength = data_length;
 736         compose[clength] = '\0';
 737
 738         if ((dlength = xfrm_ops->sx_len(compose, clength)) <
 739             L->l_collate_bufsize - coll_offset)
 740                 return (xfrm_ops->sx_xfrm(L->l_collate.sp + coll_offset,
 741                     compose, dlength + 1));
 742         else
 743                 return ((ssize_t)-1);
 744 }
 745
 746 /*ARGSUSED*/
 747 ssize_t
 748 field_convert_alpha_wide(field_t *F, line_rec_t *L, vchar_t delimiter,
 749     ssize_t data_offset, ssize_t data_length, ssize_t coll_offset)
 750 {
 751         wchar_t *compose = safe_realloc(NULL, (data_length + 1) *
 752             sizeof (wchar_t));
 753         ssize_t clength = 0;
 754         ssize_t dlength;
 755         ssize_t i;
 756         ssize_t ret;
 757
 758         for (i = data_offset; i < data_offset + data_length; i++) {
 759                 wchar_t t = (L->l_data.wp)[i];
 760
 761                 if ((F->f_options & FIELD_IGNORE_NONPRINTABLES) && !iswprint(t))
 762                         continue;
 763
 764                 if ((F->f_options & FIELD_DICTIONARY_ORDER) && !iswalnum(t) &&
 765                     !iswspace(t))
 766                         continue;
 767
 768                 if (F->f_options & FIELD_FOLD_UPPERCASE)
 769                         t = towupper(t);
 770
 771                 compose[clength++] = t;
 772         }
 773         compose[clength] = L'\0';
 774
 775         dlength = wcsxfrm(NULL, compose, (size_t)0);
 776         if ((dlength * sizeof (wchar_t)) < L->l_collate_bufsize -
 777             coll_offset * sizeof (wchar_t)) {
 778                 ret = (ssize_t)wcsxfrm(L->l_collate.wp + coll_offset, compose,
 779                     (size_t)dlength + 1);
 780         } else {
 781                 ret = (ssize_t)-1;
 782         }
 783
 784         safe_free(compose);
 785
 786         return (ret);
 787 }
 788
 789 /*
 790  * field_convert_numeric() converts the given field into a collatable numerical
 791  * sequence.  The sequence is ordered as { log, integer, separator, fraction },
 792  * with an optional sentinel component at the sequence end.
 793  */
 794 /*ARGSUSED*/
 795 ssize_t
 796 field_convert_numeric(field_t *F, line_rec_t *L, vchar_t delimiter,
 797     ssize_t data_offset, ssize_t data_length, ssize_t coll_offset)
 798 {
 799         char *number;
 800         char *buffer = L->l_collate.sp + coll_offset;
 801         ssize_t length;
 802
 803         char sign = '2';
 804         int log_ten;
 805         char *digits = buffer + 1 + sizeof (int) / sizeof (char);
 806         size_t j = 0;
 807         size_t i;
 808
 809         int state = BEFORE_NUMBER;
 810
 811         number = L->l_data.sp + data_offset;
 812         length = data_length;
 813
 814         /*
 815          * Eat leading blanks, if any.
 816          */
 817         for (i = 0; i < length; i++)
 818                 if (!IS_BLANK(number[i]))
 819                         break;
 820
 821         /*
 822          * Test that there is sufficient size in the collation buffer for our
 823          * number.  In addition to the possible remaining characters in the
 824          * field, we also require space for the sign (char), logarithm (int),
 825          * separator (char), and as many as two string terminators (for reverse
 826          * sorts).
 827          */
 828         if (((length - i) + 4 * sizeof (char) + sizeof (int)) >
 829             (L->l_collate_bufsize - coll_offset))
 830                 return ((ssize_t)-1);
 831
 832         /*
 833          * If negative, set sign.
 834          */
 835         if (number[i] == '-') {
 836                 i++;
 837                 sign = '0';
 838         }
 839
 840         /*
 841          * Scan integer part; eat leading zeros.
 842          */
 843         for (; i < length; i++) {
 844                 if (IS_SEPARATOR(number[i]))
 845                         continue;
 846
 847                 if (number[i] == '0' && !(state & IN_NUMBER))
 848                         continue;
 849
 850                 if (!isdigit((uchar_t)number[i]))
 851                         break;
 852
 853                 state |= IN_NUMBER;
 854                 if (sign == '0')
 855                         digits[j++] = '0' + '9' - number[i];
 856                 else
 857                         digits[j++] = number[i];
 858         }
 859
 860         if (i < length && IS_DECIMAL(number[i])) {
 861                 /*
 862                  * Integer part terminated by decimal.
 863                  */
 864                 digits[j] = DECIMAL_CHAR;
 865                 log_ten = j++;
 866
 867                 /*
 868                  * Scan fractional part.
 869                  */
 870                 for (++i; i < length; i++) {
 871                         if (IS_SEPARATOR(number[i]))
 872                                 continue;
 873
 874                         if (!isdigit((uchar_t)number[i]))
 875                                 break;
 876
 877                         if (number[i] != '0')
 878                                 state |= IN_NUMBER;
 879
 880                         if (sign == '0')
 881                                 digits[j++] = '0' + '9' - number[i];
 882                         else
 883                                 digits[j++] = number[i];
 884                 }
 885
 886                 if (sign == '0')
 887                         digits[j++] = (char)(UCHAR_MAX - INTERFIELD_SEPARATOR);
 888         } else {
 889                 /*
 890                  * Nondigit or end of string seen.
 891                  */
 892                 log_ten = (int)j;
 893                 if (sign == '0')
 894                         digits[j++] = (char)(UCHAR_MAX - INTERFIELD_SEPARATOR);
 895                 else
 896                         digits[j] = INTERFIELD_SEPARATOR;
 897         }
 898
 899         if ((state & IN_NUMBER) == 0) {
 900                 /*
 901                  * A non-zero number was not detected; treat as defined zero.
 902                  */
 903                 sign = '1';
 904                 log_ten = 0;
 905                 digits[0] = '0';
 906                 j = 1;
 907         }
 908
 909         /*
 910          * We subtract a constant from the log of negative values so that
 911          * they will correctly precede positive values with a zero logarithm.
 912          */
 913         if (sign == '0') {
 914                 if (j != 0)
 915                         log_ten = -log_ten - 2;
 916                 else
 917                         /*
 918                          * Special case for -0.
 919                          */
 920                         log_ten = -1;
 921         }
 922
 923         buffer[0] = sign;
 924
 925         /*
 926          * Place logarithm in big-endian form.
 927          */
 928         for (i = 0; i < sizeof (int); i++)
 929                 buffer[i + 1] = (log_ten << (i * NBBY))
 930                     >> ((sizeof (int) - 1) * NBBY);
 931
 932         if (j + sizeof (char) + sizeof (int) <
 933             L->l_collate_bufsize - coll_offset)
 934                 return (j + 1 + sizeof (int));
 935         else
 936                 return ((ssize_t)-1);
 937 }
 938
 939 /*ARGSUSED*/
 940 ssize_t
 941 field_convert_numeric_wide(field_t *F, line_rec_t *L, vchar_t delimiter,
 942     ssize_t data_offset, ssize_t data_length, ssize_t coll_offset)
 943 {
 944         wchar_t *number;
 945         wchar_t *buffer = L->l_collate.wp + coll_offset;
 946         char *lbuffer;
 947         ssize_t length;
 948
 949         wchar_t sign = L'2';
 950         int log_ten;
 951         wchar_t *digits = buffer + 1 + sizeof (int)/sizeof (wchar_t);
 952         size_t j = 0;
 953         size_t i;
 954
 955         int state = BEFORE_NUMBER;
 956
 957         number = L->l_data.wp + data_offset;
 958         length = data_length;
 959
 960         for (i = 0; i < length; i++)
 961                 if (!W_IS_BLANK(number[i]))
 962                         break;
 963
 964         if (((length - i) * sizeof (wchar_t) + 4 * sizeof (wchar_t) +
 965             sizeof (int)) > (L->l_collate_bufsize - coll_offset))
 966                 return ((ssize_t)-1);
 967
 968         if (number[i] == L'-') {
 969                 i++;
 970                 sign = L'0';
 971         }
 972
 973         for (; i < length; i++) {
 974                 if (W_IS_SEPARATOR(number[i]))
 975                         continue;
 976
 977                 if (number[i] == L'0' && !(state & IN_NUMBER))
 978                         continue;
 979
 980                 if (!iswdigit(number[i]))
 981                         break;
 982
 983                 state |= IN_NUMBER;
 984                 if (sign == L'0')
 985                         digits[j++] = L'0' + L'9' - number[i];
 986                 else
 987                         digits[j++] = number[i];
 988         }
 989
 990         if (i < length && W_IS_DECIMAL(number[i])) {
 991                 digits[j] = W_DECIMAL_CHAR;
 992                 log_ten = j++;
 993
 994                 for (++i; i < length; i++) {
 995                         if (W_IS_SEPARATOR(number[i]))
 996                                 continue;
 997
 998                         if (!iswdigit(number[i]))
 999                                 break;
1000
1001                         if (number[i] != L'0')
1002                                 state |= IN_NUMBER;
1003
1004                         if (sign == L'0')
1005                                 digits[j++] = L'0' + L'9' - number[i];
1006                         else
1007                                 digits[j++] = number[i];
1008                 }
1009
1010                 if (sign == L'0')
1011                         digits[j++] = (wchar_t)(WCHAR_MAX -
1012                             W_INTERFIELD_SEPARATOR);
1013         } else {
1014                 log_ten = (int)j;
1015                 if (sign == L'0')
1016                         digits[j++] = (wchar_t)(WCHAR_MAX -
1017                             W_INTERFIELD_SEPARATOR);
1018                 else
1019                         digits[j] = W_INTERFIELD_SEPARATOR;
1020         }
1021
1022         if ((state & IN_NUMBER) == 0) {
1023                 sign = L'1';
1024                 log_ten = 0;
1025                 digits[0] = L'0';
1026                 j = 1;
1027         }
1028
1029         if (sign == L'0') {
1030                 if (j != 0)
1031                         log_ten = -log_ten - 2;
1032                 else
1033                         log_ten = -1;
1034         }
1035
1036         buffer[0] = sign;
1037         /*
1038          * Place logarithm in big-endian form.
1039          */
1040         lbuffer = (char *)(buffer + 1);
1041         for (i = 0; i < sizeof (int); i++)
1042                 lbuffer[i] = (log_ten << (i * NBBY))
1043                     >> ((sizeof (int) - 1) * NBBY);
1044
1045         if ((j + 1 + sizeof (int)/sizeof (wchar_t)) * sizeof (wchar_t) <
1046             L->l_collate_bufsize - coll_offset * sizeof (wchar_t))
1047                 return (j + 1 + sizeof (int) / sizeof (wchar_t));
1048         else
1049                 return ((ssize_t)-1);
1050 }
1051
1052 /*
1053  * flags contains one of CV_REALLOC, CV_FAIL, specifying the preferred behaviour
1054  * when coll_offset exceeds l_collate_bufsize.
1055  */
1056 ssize_t
1057 field_convert(field_t *F, line_rec_t *L, int flags, vchar_t field_separator)
1058 {
1059         ssize_t coll_offset = 0;
1060         ssize_t start, end, distance;
1061         field_t *cur_fieldp = F;
1062
1063         while (cur_fieldp != NULL) {
1064                 /*
1065                  * delimit field
1066                  */
1067                 if (!field_separator.sc)
1068                         field_delimit(cur_fieldp, L, &start, &end);
1069                 else
1070                         field_delimit_tabbed(cur_fieldp, L, &start, &end,
1071                             field_separator);
1072
1073                 distance = 0;
1074                 if (end - start > 0 ||
1075                     (end - start == 0 && F->f_species == NUMERIC)) {
1076                         /*
1077                          * Convert field, appending to collated field of line
1078                          * record.
1079                          */
1080                         distance = cur_fieldp->f_convert(cur_fieldp, L,
1081                             field_separator, start, end - start, coll_offset);
1082
1083                         /*
1084                          * branch should execute comparatively rarely
1085                          */
1086                         if (distance == -1) {
1087                                 if (flags & FCV_REALLOC) {
1088                                         ASSERT(L->l_collate_bufsize > 0);
1089                                         L->l_collate_bufsize *= 2;
1090                                         L->l_collate.sp =
1091                                             safe_realloc(L->l_collate.sp,
1092                                             L->l_collate_bufsize);
1093
1094                                         __S(stats_incr_convert_reallocs());
1095                                         continue;
1096                                 } else {
1097                                         /*
1098                                          * FCV_FAIL has been set.
1099                                          */
1100                                         return (-1);
1101                                 }
1102                         }
1103                 }
1104
1105                 if (cur_fieldp->f_options & FIELD_REVERSE_COMPARISONS) {
1106                         xstrninv(L->l_collate.sp, coll_offset, distance);
1107                         *(L->l_collate.sp + coll_offset + distance) =
1108                             (char)(UCHAR_MAX - INTERFIELD_SEPARATOR);
1109                         distance++;
1110                 }
1111
1112                 ASSERT(distance >= 0);
1113                 coll_offset += distance;
1114                 if (coll_offset >= L->l_collate_bufsize) {
1115                         if (flags & FCV_REALLOC) {
1116                                 ASSERT(L->l_collate_bufsize > 0);
1117                                 L->l_collate_bufsize *= 2;
1118                                 L->l_collate.sp = safe_realloc(L->l_collate.sp,
1119                                     L->l_collate_bufsize);
1120
1121                                 __S(stats_incr_convert_reallocs());
1122                         } else {
1123                                 return (-1);
1124                         }
1125                 }
1126                 *(L->l_collate.sp + coll_offset) = INTERFIELD_SEPARATOR;
1127                 coll_offset++;
1128
1129                 cur_fieldp = cur_fieldp->f_next;
1130         }
1131
1132         L->l_collate_length = coll_offset;
1133
1134         return (L->l_collate_length);
1135 }
1136
1137 ssize_t
1138 field_convert_wide(field_t *F, line_rec_t *L, int flags,
1139     vchar_t field_separator)
1140 {
1141         ssize_t coll_offset = 0;
1142         ssize_t start, end, distance;
1143         field_t *cur_fieldp = F;
1144
1145         while (cur_fieldp != NULL) {
1146                 if (!field_separator.wc)
1147                         field_delimit_wide(cur_fieldp, L, &start, &end);
1148                 else
1149                         field_delimit_tabbed_wide(cur_fieldp, L, &start, &end,
1150                             field_separator);
1151
1152                 distance = 0;
1153                 if (end - start > 0 ||
1154                     end - start == 0 && F->f_species == NUMERIC) {
1155                         distance = cur_fieldp->f_convert(cur_fieldp, L,
1156                             field_separator, start, end - start, coll_offset);
1157
1158                         if (distance == -1) {
1159                                 if (flags & FCV_REALLOC) {
1160                                         ASSERT(L->l_collate_bufsize > 0);
1161                                         L->l_collate_bufsize *= 2;
1162                                         L->l_collate.wp = safe_realloc(
1163                                             L->l_collate.wp,
1164                                             L->l_collate_bufsize);
1165
1166                                         __S(stats_incr_convert_reallocs());
1167                                         continue;
1168                                 } else {
1169                                         return (-1);
1170                                 }
1171                         }
1172                 }
1173
1174                 if (cur_fieldp->f_options & FIELD_REVERSE_COMPARISONS) {
1175                         xwcsninv(L->l_collate.wp, coll_offset, distance);
1176                         *(L->l_collate.wp + coll_offset + distance) =
1177                             WCHAR_MAX - INTERFIELD_SEPARATOR;
1178                         distance++;
1179                 }
1180
1181                 ASSERT(distance >= 0);
1182                 coll_offset += distance;
1183                 if (coll_offset * sizeof (wchar_t) >= L->l_collate_bufsize) {
1184                         if (flags & FCV_REALLOC) {
1185                                 ASSERT(L->l_collate_bufsize > 0);
1186                                 L->l_collate_bufsize *= 2;
1187                                 L->l_collate.wp = safe_realloc(L->l_collate.wp,
1188                                     L->l_collate_bufsize);
1189
1190                                 __S(stats_incr_convert_reallocs());
1191                         } else {
1192                                 return (-1);
1193                         }
1194                 }
1195                 *(L->l_collate.wp + coll_offset) = W_INTERFIELD_SEPARATOR;
1196                 coll_offset++;
1197
1198                 cur_fieldp = cur_fieldp->f_next;
1199         }
1200
1201         L->l_collate_length = coll_offset * sizeof (wchar_t);
1202 #ifdef _LITTLE_ENDIAN
1203         xwcsntomsb(L->l_collate.wp, coll_offset);
1204 #endif /* _LITTLE_ENDIAN */
1205
1206         return (L->l_collate_length);
1207 }
1208
1209 /*
1210  * line_convert() and line_convert_wide() are called when the collation vector
1211  * of a given line has been exhausted, and we are performing the final,
1212  * full-line comparison required by the sort specification.  Because we do not
1213  * have a guarantee that l_data is null-terminated, we create an explicitly
1214  * null-terminated copy suitable for transformation to a collatable form for the
1215  * current locale.
1216  */
1217 static void
1218 line_convert(line_rec_t *L)
1219 {
1220         static ssize_t bufsize;
1221         static char *buffer;
1222
1223         if (L->l_raw_collate.sp != NULL)
1224                 return;
1225
1226         if (L->l_data_length + 1 > bufsize) {
1227                 buffer = safe_realloc(buffer, L->l_data_length + 1);
1228                 bufsize = L->l_data_length + 1;
1229         }
1230
1231         (void) strncpy(buffer, L->l_data.sp, L->l_data_length);
1232         buffer[L->l_data_length] = '\0';
1233
1234         L->l_raw_collate.sp = safe_realloc(L->l_raw_collate.sp,
1235             xfrm_ops->sx_len(buffer, L->l_data_length) + 1);
1236         xfrm_ops->sx_xfrm(L->l_raw_collate.sp, buffer,
1237             xfrm_ops->sx_len(buffer, L->l_data_length) + 1);
1238
1239         __S(stats_incr_line_conversions());
1240 }
1241
1242 static void
1243 line_convert_wide(line_rec_t *L)
1244 {
1245         static wchar_t *buffer;
1246         static ssize_t bufsize;
1247
1248         ssize_t dlength;
1249
1250         if (L->l_raw_collate.wp != NULL)
1251                 return;
1252
1253         if (L->l_data_length + 1 > bufsize) {
1254                 buffer = safe_realloc(buffer, (L->l_data_length + 1) *
1255                     sizeof (wchar_t));
1256                 bufsize = L->l_data_length + 1;
1257         }
1258
1259         (void) wcsncpy(buffer, L->l_data.wp, L->l_data_length);
1260         buffer[L->l_data_length] = L'\0';
1261
1262         dlength = wcsxfrm(NULL, buffer, 0) + 1;
1263         L->l_raw_collate.wp = safe_realloc(L->l_raw_collate.wp, dlength *
1264             sizeof (wchar_t));
1265         (void) wcsxfrm(L->l_raw_collate.wp, buffer, dlength);
1266
1267         __S(stats_incr_line_conversions());
1268 }
1269
1270 /*
1271  * Our convention for collation is
1272  *
1273  *      A > B  => r > 0,
1274  *      A == B => r = 0,
1275  *      A < B  => r < 0
1276  *
1277  * This convention is consistent with the definition of memcmp(), strcmp(), and
1278  * strncmp() in the C locale.  collated() and collated_wide() have two optional
1279  * behaviours, which can be activated by setting the appropriate values in
1280  * coll_flag:  COLL_UNIQUE, which returns 0 if the l_collate fields of the line
1281  * records being compared are identical; COLL_DATA_ONLY, which ignores the
1282  * l_collate field for the current comparison; and COLL_REVERSE, which flips the
1283  * result for comparisons that fall through to an actual data comparison (since
1284  * the collated vector should already reflect reverse ordering from field
1285  * conversion).
1286  */
1287 int
1288 collated(line_rec_t *A, line_rec_t *B, ssize_t depth, flag_t coll_flag)
1289 {
1290         ssize_t ml = MIN(A->l_collate_length, B->l_collate_length) - depth;
1291         int r;
1292         int mask = (coll_flag & COLL_REVERSE) ? INT_SIGN_FLIP_MASK :
1293             INT_SIGN_PASS_MASK;
1294         ssize_t la, lb;
1295
1296         if (!(coll_flag & COLL_DATA_ONLY)) {
1297                 if (ml > 0) {
1298                         r = memcmp(A->l_collate.sp + depth,
1299                             B->l_collate.sp + depth, ml);
1300
1301                         if (r)
1302                                 return (r);
1303                 }
1304
1305                 if (A->l_collate_length < B->l_collate_length)
1306                         return (-1);
1307
1308                 if (A->l_collate_length > B->l_collate_length)
1309                         return (1);
1310         }
1311
1312         /*
1313          * This is where we cut out, if we know that the current sort is over
1314          * the entire line.
1315          */
1316         if (coll_flag & COLL_UNIQUE)
1317                 return (0);
1318
1319         line_convert(A);
1320         line_convert(B);
1321
1322         la = strlen(A->l_raw_collate.sp);
1323         lb = strlen(B->l_raw_collate.sp);
1324
1325         r = memcmp(A->l_raw_collate.sp, B->l_raw_collate.sp, MIN(la, lb));
1326
1327         if (r)
1328                 return (r ^ mask);
1329
1330         if (la < lb)
1331                 return (-1 ^ mask);
1332
1333         if (la > lb)
1334                 return (1 ^ mask);
1335
1336         return (0);
1337 }
1338
1339 int
1340 collated_wide(line_rec_t *A, line_rec_t *B, ssize_t depth, flag_t coll_flag)
1341 {
1342         ssize_t ml = MIN(A->l_collate_length, B->l_collate_length) - depth;
1343         int r;
1344         int mask = (coll_flag & COLL_REVERSE) ? INT_SIGN_FLIP_MASK :
1345             INT_SIGN_PASS_MASK;
1346         ssize_t la, lb;
1347
1348         if (!(coll_flag & COLL_DATA_ONLY)) {
1349                 if (ml > 0) {
1350                         r = memcmp(A->l_collate.sp + depth,
1351                             B->l_collate.sp + depth, ml);
1352
1353                         if (r)
1354                                 return (r);
1355                 }
1356                 if (A->l_collate_length < B->l_collate_length)
1357                         return (-1);
1358
1359                 if (A->l_collate_length > B->l_collate_length)
1360                         return (1);
1361         }
1362
1363         if (coll_flag & COLL_UNIQUE)
1364                 return (0);
1365
1366         line_convert_wide(A);
1367         line_convert_wide(B);
1368
1369         la = wcslen(A->l_raw_collate.wp);
1370         lb = wcslen(B->l_raw_collate.wp);
1371
1372         r = wmemcmp(A->l_raw_collate.wp, B->l_raw_collate.wp,
1373             (size_t)MIN(la, lb));
1374
1375         if (r)
1376                 return (r ^ mask);
1377
1378         if (la < lb)
1379                 return (-1 ^ mask);
1380
1381         if (la > lb)
1382                 return (1 ^ mask);
1383
1384         return (0);
1385 }