4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
23 * Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved.
32 * By a field, we mean the various delimited character sequences within each
33 * line of the input files. The sort key consists of an ordered sequence of
34 * fields, which need not include all possible fields for the given line.
35 * (Furthermore, not every line need contain sufficient fields for the fields
36 * given within the sort key. In fact, none of the lines in the input stream
37 * need contain sufficient fields.)
39 * There are two methods for specifying fields for sort(1); these are
40 * discussed in options.c. Here we discuss only the internal representation
41 * of fields, as used for constructing the collation vector for each line as
42 * defined by the sort key.
45 * The sort key is a singly-linked list of field specifiers. At present,
46 * fields may belong to one of three species: alphabetical, numerical, or
47 * monthly; the species (f_species) then indicates the conversion function
48 * (f_convert) used to transform the raw characters of the character sequence
49 * to a collatable form. (In principle, this allows us to consider future
50 * field species such as hexadecimal.)
52 * Fields and offsets are numbered such that zero refers to the first field or
53 * character, respectively. Thus, the interpretation of a key specifier, m.n,
54 * is that the field begins at the nth character beyond the mth occurence of
55 * the key separator. If the blanks flag has been specified, then the field
56 * begins at the nth non-blank character past the mth key separator. If the
57 * key separator is unspecified, then the key separator is defined as one or
58 * more blank characters.
60 * In general, the various options afforded by sort may be broken into two
61 * categories: field species and field modifiers. For each field species,
62 * there is one or more conversion routines that take a delimited character
63 * sequence and convert it to a character sequence collatable by strcmp() or
64 * memcmp(). For field species that may be further modified, such as the
65 * fold-to-uppercase option for alphabetic fields, the conversion routine may
66 * be aware of how the modifier affects collation. Finally, the no-modifiers
67 * case may present an opportunity for a simplified, faster version.
70 * The code paths for single-byte and multi-byte locales diverge significantly
71 * in fields.c. Most routines have an *_wide() version, which produces an
72 * equivalent effect for line records whose data field is composed of wide
73 * characters (wchar_t). However, the l_collated field of a line record is
74 * always composed of characters, so that the radix sorts provided in
75 * internal.c can work in both single- and multi-byte locales. Thus, in the
76 * various convert_*_wide() routines, the output is placed in l_collated, with
77 * a length multiplier of 4.
80 #define BEFORE_NUMBER 0x0
83 static char numerical_separator
;
84 static char numerical_decimal
;
85 static char monetary_separator
;
86 static char monetary_decimal
;
88 static wchar_t w_numerical_separator
;
89 static wchar_t w_numerical_decimal
;
90 static wchar_t w_monetary_separator
;
91 static wchar_t w_monetary_decimal
;
93 #define MONTHS_IN_YEAR 12
94 #define MAX_MON_LEN 20
96 enum { MO_NONE
= 1, MO_OFFSET
= 2 };
98 static char *months
[MONTHS_IN_YEAR
];
99 static size_t month_lengths
[MONTHS_IN_YEAR
];
100 static wchar_t *w_months
[MONTHS_IN_YEAR
];
101 static size_t w_month_lengths
[MONTHS_IN_YEAR
];
103 #define DECIMAL_CHAR (numerical_decimal)
104 #define IS_BLANK(x) (isspace((uchar_t)(x)) && (x) != '\n')
105 #define IS_SEPARATOR(x) \
106 ((numerical_separator != '\0' && (x) == numerical_separator) || \
107 (monetary_separator != '\0' && (x) == monetary_separator))
108 #define IS_DECIMAL(x) \
109 ((x) == numerical_decimal || \
110 (monetary_decimal != '\0' && (x) == monetary_decimal))
111 #define W_DECIMAL_CHAR (w_numerical_decimal)
112 #define W_IS_BLANK(x) (iswspace(x) && (x) != L'\n')
113 #define W_IS_SEPARATOR(x) \
114 ((numerical_separator != '\0' && (x) == w_numerical_separator) || \
115 (monetary_separator != '\0' && (x) == w_monetary_separator))
116 #define W_IS_DECIMAL(x) \
117 (((x) == w_numerical_decimal) || \
118 (monetary_decimal != '\0' && (x) == w_monetary_decimal))
120 #define INTERFIELD_SEPARATOR '\0'
121 #define W_INTERFIELD_SEPARATOR L'\0'
123 #define INT_SIGN_FLIP_MASK 0x80000000
124 #define INT_SIGN_PASS_MASK 0x00000000
127 * strx_ops_t, xfrm_len, and xfrm_cpy: In the case where we are sorting in the
128 * C locale, we want to avoid the expense of transforming strings to collatable
129 * forms since, by definition, an arbitrary string in the C locale is already in
130 * its collatable form. Therefore, we construct a small ops vector (the
131 * strx_ops) and two wrappers: xfrm_len() to massage the strxfrm(NULL, ...) into
132 * strlen()-like behaviour, and xfrm_cpy() to make strncpy() appear
137 xfrm_len(const char *s2
, size_t len
)
139 return (strxfrm(NULL
, s2
, 0) + 1);
143 * The length represented by n includes a null character, so to return the
144 * correct length we subtract 1. Note that this function is only used by
145 * field_convert_alpha, and isn't for general use, as it assumes that n is the
146 * length of s2 plus a null character.
149 C_ncpy(char *s1
, const char *s2
, size_t n
)
151 (void) strncpy(s1
, s2
, n
);
157 C_len(const char *s
, size_t len
)
163 typedef struct _strx_ops
{
164 size_t (*sx_len
)(const char *, size_t);
165 size_t (*sx_xfrm
)(char *, const char *, size_t);
168 static const strx_ops_t C_ops
= { C_len
, C_ncpy
};
169 static const strx_ops_t SB_ops
= { xfrm_len
, strxfrm
};
171 static const strx_ops_t
*xfrm_ops
;
174 field_initialize_separator(void)
177 * A locale need not define all of the cases below: only decimal_point
178 * must be defined. Furthermore, sort(1) has traditionally not used the
179 * positive_sign and negative_sign, grouping, or currency_symbols (or
180 * their numeric counterparts, if any).
182 struct lconv
*conv
= localeconv();
184 if (!xstreql(conv
->thousands_sep
, "")) {
185 numerical_separator
= *conv
->thousands_sep
;
186 (void) mbtowc(&w_numerical_separator
, conv
->thousands_sep
,
189 numerical_separator
= '\0';
191 if (!xstreql(conv
->mon_thousands_sep
, "")) {
192 monetary_separator
= *conv
->mon_thousands_sep
;
193 (void) mbtowc(&w_monetary_separator
, conv
->mon_thousands_sep
,
196 monetary_separator
= '\0';
198 if (!xstreql(conv
->mon_decimal_point
, "")) {
199 monetary_decimal
= *conv
->mon_decimal_point
;
200 (void) mbtowc(&w_monetary_decimal
, conv
->mon_decimal_point
,
203 monetary_decimal
= '\0';
205 numerical_decimal
= *conv
->decimal_point
;
206 (void) mbtowc(&w_numerical_decimal
, conv
->decimal_point
, MB_CUR_MAX
);
210 field_initialize_month(int is_c_locale
)
214 struct tm this_month
;
215 const char *c_months
[MONTHS_IN_YEAR
] = {
216 "JAN", "FEB", "MAR", "APR", "MAY", "JUN",
217 "JUL", "AUG", "SEP", "OCT", "NOV", "DEC"
220 char month_name
[MAX_MON_LEN
* MB_LEN_MAX
];
221 wchar_t w_month_name
[MAX_MON_LEN
];
224 for (i
= 0; i
< MONTHS_IN_YEAR
; i
++) {
225 months
[i
] = (char *)c_months
[i
];
226 month_lengths
[i
] = strlen(c_months
[i
]);
229 * We don't need to initialize the wide version of the month
235 (void) memset(&this_month
, 0, sizeof (this_month
));
237 for (i
= 0; i
< MONTHS_IN_YEAR
; i
++) {
238 this_month
.tm_mon
= i
;
240 (void) strftime(month_name
, sizeof (month_name
),
243 for (j
= 0; j
< strlen(month_name
); j
++)
244 month_name
[j
] = toupper(month_name
[j
]);
245 (void) mbstowcs(w_month_name
, month_name
, MAX_MON_LEN
);
247 months
[i
] = strdup(month_name
);
248 month_lengths
[i
] = strlen(month_name
);
249 w_months
[i
] = wsdup(w_month_name
);
250 w_month_lengths
[i
] = wslen(w_month_name
);
255 field_initialize(sort_t
*S
)
257 field_initialize_month(S
->m_c_locale
);
258 field_initialize_separator();
269 field_t
*F
= safe_realloc(NULL
, sizeof (field_t
));
271 F
->f_start_field
= -1;
272 F
->f_start_offset
= -1;
274 F
->f_end_offset
= -1;
278 F
->f_species
= ALPHA
;
281 F
->f_species
= S
->m_default_species
;
282 F
->f_options
= S
->m_field_options
;
289 field_delete(field_t
*F
)
295 * The recursive implementation of field_add_to_chain() given below is
296 * inappropriate if function calls are expensive, or a truly large number of
297 * fields are anticipated.
300 field_add_to_chain(field_t
**F
, field_t
*A
)
305 field_add_to_chain(&((*F
)->f_next
), A
);
311 "\nStart field: %d\tStart offset: %d\nEnd field: %d\tEnd offset: %d\n"
314 "\nStart field: %ld\tStart offset: %ld\nEnd field: %ld\tEnd offset: %ld\n"
318 * field_print is used only for debugging purposes.
321 field_print(field_t
*F
)
323 char *field_names
[] = {"ALPHA", "MONTH", "NUMERIC"};
326 (void) fprintf(stderr
, "Type: %s", field_names
[F
->f_species
]);
327 (void) fprintf(stderr
, "\tOptions: ");
329 if (F
->f_options
& FIELD_REVERSE_COMPARISONS
) {
330 (void) fprintf(stderr
, "REVERSE");
333 if (F
->f_options
& FIELD_DICTIONARY_ORDER
) {
334 (void) fprintf(stderr
, "DICTIONARY ");
337 if (F
->f_options
& FIELD_FOLD_UPPERCASE
) {
338 (void) fprintf(stderr
, "UPPERCASE ");
341 if (F
->f_options
& FIELD_IGNORE_NONPRINTABLES
) {
342 (void) fprintf(stderr
, "PRINTABLES ");
345 if (F
->f_options
& FIELD_IGNORE_BLANKS_START
) {
346 (void) fprintf(stderr
, "BLANKS_START ");
349 if (F
->f_options
& FIELD_IGNORE_BLANKS_END
) {
350 (void) fprintf(stderr
, "BLANKS_END ");
355 (void) fprintf(stderr
, "NO_MODIFIERS");
357 (void) fprintf(stderr
, FIELD_FMT
, F
->f_start_field
, F
->f_start_offset
,
358 F
->f_end_field
, F
->f_end_offset
);
363 field_boundary(field_t
*F
, line_rec_t
*L
, int is_end
, int is_blanks
)
365 char *S
= L
->l_data
.sp
;
367 char *eol
= S
+ L
->l_data_length
;
368 ssize_t field
= is_end
? F
->f_end_field
: F
->f_start_field
;
369 ssize_t offset
= is_end
? F
->f_end_offset
: F
->f_start_offset
;
372 ASSERT(is_end
|| field
> -1);
374 if (is_end
&& field
== -1)
375 return (L
->l_data_length
);
377 while (field
-- > 0) {
378 while (T
< eol
&& IS_BLANK(*T
))
381 while (T
< eol
&& !IS_BLANK(*T
))
385 if ((!is_end
|| offset
> 0) && is_blanks
) {
390 if ((ret
= MAX(T
- S
, 0) + offset
) >= L
->l_data_length
)
391 return (L
->l_data_length
);
397 field_delimit(field_t
*F
, line_rec_t
*L
, ssize_t
*start
, ssize_t
*end
)
399 ASSERT(F
->f_start_field
> -1);
401 *start
= field_boundary(F
, L
, 0,
402 F
->f_options
& FIELD_IGNORE_BLANKS_START
);
403 *end
= field_boundary(F
, L
, 1,
404 F
->f_options
& FIELD_IGNORE_BLANKS_END
);
408 field_boundary_wide(field_t
*F
, line_rec_t
*L
, int is_end
, int is_blanks
)
410 wchar_t *S
= L
->l_data
.wp
;
412 wchar_t *eol
= S
+ L
->l_data_length
;
413 ssize_t field
= is_end
? F
->f_end_field
: F
->f_start_field
;
414 ssize_t offset
= is_end
? F
->f_end_offset
: F
->f_start_offset
;
417 ASSERT(is_end
|| field
> -1);
419 if (is_end
&& field
== -1)
420 return (L
->l_data_length
);
422 while (field
-- > 0) {
423 while (T
< eol
&& W_IS_BLANK(*T
))
426 while (T
< eol
&& !W_IS_BLANK(*T
))
430 if ((!is_end
|| offset
> 0) && is_blanks
) {
431 while (W_IS_BLANK(*T
))
435 if ((ret
= MAX(T
- S
, 0) + offset
) >= L
->l_data_length
)
436 return (L
->l_data_length
);
442 field_delimit_wide(field_t
*F
, line_rec_t
*L
, ssize_t
*start
, ssize_t
*end
)
444 ASSERT(F
->f_start_field
> -1);
446 *start
= field_boundary_wide(F
, L
, 0,
447 F
->f_options
& FIELD_IGNORE_BLANKS_START
);
448 *end
= field_boundary_wide(F
, L
, 1,
449 F
->f_options
& FIELD_IGNORE_BLANKS_END
);
453 field_boundary_tabbed(field_t
*F
, line_rec_t
*L
, int is_end
, int is_blanks
,
456 char *S
= L
->l_data
.sp
;
458 char *eol
= S
+ L
->l_data_length
;
459 ssize_t field
= is_end
? F
->f_end_field
: F
->f_start_field
;
460 ssize_t offset
= is_end
? F
->f_end_offset
: F
->f_start_offset
;
463 ASSERT(is_end
|| field
> -1);
465 if (is_end
&& field
== -1)
466 return (L
->l_data_length
);
468 while (field
-- > 0) {
469 T
= xstrnchr(T
, delimiter
.sc
, eol
- T
);
470 if (T
== NULL
|| T
> eol
)
471 return (L
->l_data_length
);
476 if ((!is_end
|| offset
!= 0) && is_blanks
) {
481 if ((ret
= MAX(T
- S
, 0) + offset
) >= L
->l_data_length
) {
482 if (L
->l_data_length
<= 0)
484 if (S
[L
->l_data_length
- 1] == delimiter
.sc
) {
485 return (L
->l_data_length
- 1);
487 return (L
->l_data_length
);
491 if (is_end
&& offset
== 0)
498 * field_delimit_tabbed() is called when a field separator has been defined
499 * using the -t option. The character at the offset, start, is either one or
500 * more character positions past the delimiter marking the start of the
501 * field, or at the end of the line.
504 field_delimit_tabbed(field_t
*F
, line_rec_t
*L
, ssize_t
*start
, ssize_t
*end
,
507 ASSERT(F
->f_start_field
> -1);
509 *start
= field_boundary_tabbed(F
, L
, 0, F
->f_options
&
510 FIELD_IGNORE_BLANKS_START
, delimiter
);
511 *end
= field_boundary_tabbed(F
, L
, 1, F
->f_options
&
512 FIELD_IGNORE_BLANKS_END
, delimiter
);
516 field_boundary_tabbed_wide(field_t
*F
, line_rec_t
*L
, int is_end
, int is_blanks
,
519 wchar_t *S
= L
->l_data
.wp
;
521 wchar_t *eol
= S
+ L
->l_data_length
;
522 ssize_t field
= is_end
? F
->f_end_field
: F
->f_start_field
;
523 ssize_t offset
= is_end
? F
->f_end_offset
: F
->f_start_offset
;
526 ASSERT(is_end
|| field
> -1);
528 if (is_end
&& field
== -1)
529 return (L
->l_data_length
);
531 while (field
-- > 0) {
532 T
= xwsnchr(T
, delimiter
.wc
, eol
- T
);
533 if (T
== NULL
|| T
> eol
)
534 return (L
->l_data_length
);
539 if ((!is_end
|| offset
!= 0) && is_blanks
) {
540 while (W_IS_BLANK(*T
))
544 if ((ret
= MAX(T
- S
, 0) + offset
) >= L
->l_data_length
) {
545 if (L
->l_data_length
<= 0)
547 if (S
[L
->l_data_length
- 1] == delimiter
.wc
) {
548 return (L
->l_data_length
- 1);
550 return (L
->l_data_length
);
554 if (is_end
&& offset
== 0)
561 field_delimit_tabbed_wide(field_t
*F
, line_rec_t
*L
, ssize_t
*start
,
562 ssize_t
*end
, vchar_t delimiter
)
564 ASSERT(F
->f_start_field
> -1);
566 *start
= field_boundary_tabbed_wide(F
, L
, 0, F
->f_options
&
567 FIELD_IGNORE_BLANKS_START
, delimiter
);
568 *end
= field_boundary_tabbed_wide(F
, L
, 1, F
->f_options
&
569 FIELD_IGNORE_BLANKS_END
, delimiter
);
574 field_convert_month(field_t
*F
, line_rec_t
*L
, vchar_t delimiter
,
575 ssize_t data_offset
, ssize_t data_length
, ssize_t coll_offset
)
579 char month_candidate
[MAX_MON_LEN
* MB_LEN_MAX
];
580 ssize_t month_length
= data_length
;
581 ssize_t month_offset
= data_offset
;
583 if (sizeof (char) > L
->l_collate_bufsize
- coll_offset
)
586 (void) memset(month_candidate
, 0, MAX_MON_LEN
* MB_LEN_MAX
);
590 * The month field formally begins with the first non-blank character.
592 while (IS_BLANK(*(L
->l_data
.sp
+ month_offset
))) {
597 for (j
= 0; j
< MAX_MON_LEN
&& j
< month_length
; j
++)
598 month_candidate
[j
] = toupper((L
->l_data
.sp
+ month_offset
)[j
]);
600 for (j
= 0; j
< MONTHS_IN_YEAR
; j
++) {
601 if (xstrneql(month_candidate
, months
[j
], month_lengths
[j
])) {
602 *(L
->l_collate
.sp
+ coll_offset
) = '\0' + j
+ MO_OFFSET
;
608 * no matching month; copy string into field. required behaviour is
609 * that "month-free" keys sort before month-sortable keys, so insert
610 * a "will sort first" token.
612 *(L
->l_collate
.sp
+ coll_offset
) = '\0' + MO_NONE
;
614 val
= field_convert_alpha_simple(F
, L
, delimiter
, data_offset
,
615 data_length
, coll_offset
+ 1);
625 field_convert_month_wide(field_t
*F
, line_rec_t
*L
, vchar_t delimiter
,
626 ssize_t data_offset
, ssize_t data_length
, ssize_t coll_offset
)
630 wchar_t month_candidate
[MAX_MON_LEN
];
632 wchar_t *buffer
= L
->l_collate
.wp
+ coll_offset
;
633 ssize_t month_length
= data_length
;
634 ssize_t month_offset
= data_offset
;
636 if (L
->l_collate_bufsize
- coll_offset
* sizeof (wchar_t) <
640 (void) memset(month_candidate
, 0, MAX_MON_LEN
* sizeof (wchar_t));
643 while (W_IS_BLANK(*(L
->l_data
.wp
+ month_offset
))) {
648 month
= L
->l_data
.wp
+ month_offset
;
650 for (j
= 0; j
< MAX_MON_LEN
&& j
< month_length
; j
++)
651 month_candidate
[j
] = towupper(month
[j
]);
653 for (j
= 0; j
< MONTHS_IN_YEAR
; j
++)
654 if (xwcsneql(month_candidate
, w_months
[j
],
655 w_month_lengths
[j
])) {
656 *buffer
= L
'\0' + j
+ MO_OFFSET
;
660 *buffer
= L
'\0' + MO_NONE
;
662 val
= field_convert_alpha_wide(F
, L
, delimiter
, data_offset
,
663 data_length
, coll_offset
+ sizeof (wchar_t));
672 * field_convert_alpha() always fails with return value -1 if the converted
673 * string would cause l_collate_length to exceed l_collate_bufsize
677 field_convert_alpha(field_t
*F
, line_rec_t
*L
, vchar_t delimiter
,
678 ssize_t data_offset
, ssize_t data_length
, ssize_t coll_offset
)
680 static char *compose
;
681 static ssize_t compose_length
;
687 if (compose_length
< (data_length
+ 1)) {
688 compose_length
= data_length
+ 1;
689 compose
= safe_realloc(compose
, compose_length
* sizeof (char));
692 for (i
= data_offset
; i
< data_offset
+ data_length
; i
++) {
693 char t
= (L
->l_data
.sp
)[i
];
695 if ((F
->f_options
& FIELD_IGNORE_NONPRINTABLES
) &&
696 !isprint((uchar_t
)t
))
699 if ((F
->f_options
& FIELD_DICTIONARY_ORDER
) &&
700 !isalnum((uchar_t
)t
) && !isspace((uchar_t
)t
))
703 if (F
->f_options
& FIELD_FOLD_UPPERCASE
)
706 compose
[clength
++] = t
;
708 compose
[clength
] = '\0';
710 if ((dlength
= xfrm_ops
->sx_len(compose
, clength
)) <
711 L
->l_collate_bufsize
- coll_offset
)
712 return (xfrm_ops
->sx_xfrm(L
->l_collate
.sp
+ coll_offset
,
713 compose
, dlength
+ 1));
715 return ((ssize_t
)-1);
720 field_convert_alpha_simple(field_t
*F
, line_rec_t
*L
, vchar_t delimiter
,
721 ssize_t data_offset
, ssize_t data_length
, ssize_t coll_offset
)
723 static char *compose
;
724 static ssize_t compose_length
;
729 if (compose_length
< (data_length
+ 1)) {
730 compose_length
= data_length
+ 1;
731 compose
= safe_realloc(compose
, compose_length
* sizeof (char));
734 (void) memcpy(compose
, L
->l_data
.sp
+ data_offset
, data_length
);
735 clength
= data_length
;
736 compose
[clength
] = '\0';
738 if ((dlength
= xfrm_ops
->sx_len(compose
, clength
)) <
739 L
->l_collate_bufsize
- coll_offset
)
740 return (xfrm_ops
->sx_xfrm(L
->l_collate
.sp
+ coll_offset
,
741 compose
, dlength
+ 1));
743 return ((ssize_t
)-1);
748 field_convert_alpha_wide(field_t
*F
, line_rec_t
*L
, vchar_t delimiter
,
749 ssize_t data_offset
, ssize_t data_length
, ssize_t coll_offset
)
751 wchar_t *compose
= safe_realloc(NULL
, (data_length
+ 1) *
758 for (i
= data_offset
; i
< data_offset
+ data_length
; i
++) {
759 wchar_t t
= (L
->l_data
.wp
)[i
];
761 if ((F
->f_options
& FIELD_IGNORE_NONPRINTABLES
) && !iswprint(t
))
764 if ((F
->f_options
& FIELD_DICTIONARY_ORDER
) && !iswalnum(t
) &&
768 if (F
->f_options
& FIELD_FOLD_UPPERCASE
)
771 compose
[clength
++] = t
;
773 compose
[clength
] = L
'\0';
775 dlength
= wcsxfrm(NULL
, compose
, (size_t)0);
776 if ((dlength
* sizeof (wchar_t)) < L
->l_collate_bufsize
-
777 coll_offset
* sizeof (wchar_t)) {
778 ret
= (ssize_t
)wcsxfrm(L
->l_collate
.wp
+ coll_offset
, compose
,
779 (size_t)dlength
+ 1);
790 * field_convert_numeric() converts the given field into a collatable numerical
791 * sequence. The sequence is ordered as { log, integer, separator, fraction },
792 * with an optional sentinel component at the sequence end.
796 field_convert_numeric(field_t
*F
, line_rec_t
*L
, vchar_t delimiter
,
797 ssize_t data_offset
, ssize_t data_length
, ssize_t coll_offset
)
800 char *buffer
= L
->l_collate
.sp
+ coll_offset
;
805 char *digits
= buffer
+ 1 + sizeof (int) / sizeof (char);
809 int state
= BEFORE_NUMBER
;
811 number
= L
->l_data
.sp
+ data_offset
;
812 length
= data_length
;
815 * Eat leading blanks, if any.
817 for (i
= 0; i
< length
; i
++)
818 if (!IS_BLANK(number
[i
]))
822 * Test that there is sufficient size in the collation buffer for our
823 * number. In addition to the possible remaining characters in the
824 * field, we also require space for the sign (char), logarithm (int),
825 * separator (char), and as many as two string terminators (for reverse
828 if (((length
- i
) + 4 * sizeof (char) + sizeof (int)) >
829 (L
->l_collate_bufsize
- coll_offset
))
830 return ((ssize_t
)-1);
833 * If negative, set sign.
835 if (number
[i
] == '-') {
841 * Scan integer part; eat leading zeros.
843 for (; i
< length
; i
++) {
844 if (IS_SEPARATOR(number
[i
]))
847 if (number
[i
] == '0' && !(state
& IN_NUMBER
))
850 if (!isdigit((uchar_t
)number
[i
]))
855 digits
[j
++] = '0' + '9' - number
[i
];
857 digits
[j
++] = number
[i
];
860 if (i
< length
&& IS_DECIMAL(number
[i
])) {
862 * Integer part terminated by decimal.
864 digits
[j
] = DECIMAL_CHAR
;
868 * Scan fractional part.
870 for (++i
; i
< length
; i
++) {
871 if (IS_SEPARATOR(number
[i
]))
874 if (!isdigit((uchar_t
)number
[i
]))
877 if (number
[i
] != '0')
881 digits
[j
++] = '0' + '9' - number
[i
];
883 digits
[j
++] = number
[i
];
887 digits
[j
++] = (char)(UCHAR_MAX
- INTERFIELD_SEPARATOR
);
890 * Nondigit or end of string seen.
894 digits
[j
++] = (char)(UCHAR_MAX
- INTERFIELD_SEPARATOR
);
896 digits
[j
] = INTERFIELD_SEPARATOR
;
899 if ((state
& IN_NUMBER
) == 0) {
901 * A non-zero number was not detected; treat as defined zero.
910 * We subtract a constant from the log of negative values so that
911 * they will correctly precede positive values with a zero logarithm.
915 log_ten
= -log_ten
- 2;
918 * Special case for -0.
926 * Place logarithm in big-endian form.
928 for (i
= 0; i
< sizeof (int); i
++)
929 buffer
[i
+ 1] = (log_ten
<< (i
* NBBY
))
930 >> ((sizeof (int) - 1) * NBBY
);
932 if (j
+ sizeof (char) + sizeof (int) <
933 L
->l_collate_bufsize
- coll_offset
)
934 return (j
+ 1 + sizeof (int));
936 return ((ssize_t
)-1);
941 field_convert_numeric_wide(field_t
*F
, line_rec_t
*L
, vchar_t delimiter
,
942 ssize_t data_offset
, ssize_t data_length
, ssize_t coll_offset
)
945 wchar_t *buffer
= L
->l_collate
.wp
+ coll_offset
;
951 wchar_t *digits
= buffer
+ 1 + sizeof (int)/sizeof (wchar_t);
955 int state
= BEFORE_NUMBER
;
957 number
= L
->l_data
.wp
+ data_offset
;
958 length
= data_length
;
960 for (i
= 0; i
< length
; i
++)
961 if (!W_IS_BLANK(number
[i
]))
964 if (((length
- i
) * sizeof (wchar_t) + 4 * sizeof (wchar_t) +
965 sizeof (int)) > (L
->l_collate_bufsize
- coll_offset
))
966 return ((ssize_t
)-1);
968 if (number
[i
] == L
'-') {
973 for (; i
< length
; i
++) {
974 if (W_IS_SEPARATOR(number
[i
]))
977 if (number
[i
] == L
'0' && !(state
& IN_NUMBER
))
980 if (!iswdigit(number
[i
]))
985 digits
[j
++] = L
'0' + L
'9' - number
[i
];
987 digits
[j
++] = number
[i
];
990 if (i
< length
&& W_IS_DECIMAL(number
[i
])) {
991 digits
[j
] = W_DECIMAL_CHAR
;
994 for (++i
; i
< length
; i
++) {
995 if (W_IS_SEPARATOR(number
[i
]))
998 if (!iswdigit(number
[i
]))
1001 if (number
[i
] != L
'0')
1005 digits
[j
++] = L
'0' + L
'9' - number
[i
];
1007 digits
[j
++] = number
[i
];
1011 digits
[j
++] = (wchar_t)(WCHAR_MAX
-
1012 W_INTERFIELD_SEPARATOR
);
1016 digits
[j
++] = (wchar_t)(WCHAR_MAX
-
1017 W_INTERFIELD_SEPARATOR
);
1019 digits
[j
] = W_INTERFIELD_SEPARATOR
;
1022 if ((state
& IN_NUMBER
) == 0) {
1031 log_ten
= -log_ten
- 2;
1038 * Place logarithm in big-endian form.
1040 lbuffer
= (char *)(buffer
+ 1);
1041 for (i
= 0; i
< sizeof (int); i
++)
1042 lbuffer
[i
] = (log_ten
<< (i
* NBBY
))
1043 >> ((sizeof (int) - 1) * NBBY
);
1045 if ((j
+ 1 + sizeof (int)/sizeof (wchar_t)) * sizeof (wchar_t) <
1046 L
->l_collate_bufsize
- coll_offset
* sizeof (wchar_t))
1047 return (j
+ 1 + sizeof (int) / sizeof (wchar_t));
1049 return ((ssize_t
)-1);
1053 * flags contains one of CV_REALLOC, CV_FAIL, specifying the preferred behaviour
1054 * when coll_offset exceeds l_collate_bufsize.
1057 field_convert(field_t
*F
, line_rec_t
*L
, int flags
, vchar_t field_separator
)
1059 ssize_t coll_offset
= 0;
1060 ssize_t start
, end
, distance
;
1061 field_t
*cur_fieldp
= F
;
1063 while (cur_fieldp
!= NULL
) {
1067 if (!field_separator
.sc
)
1068 field_delimit(cur_fieldp
, L
, &start
, &end
);
1070 field_delimit_tabbed(cur_fieldp
, L
, &start
, &end
,
1074 if (end
- start
> 0 ||
1075 (end
- start
== 0 && F
->f_species
== NUMERIC
)) {
1077 * Convert field, appending to collated field of line
1080 distance
= cur_fieldp
->f_convert(cur_fieldp
, L
,
1081 field_separator
, start
, end
- start
, coll_offset
);
1084 * branch should execute comparatively rarely
1086 if (distance
== -1) {
1087 if (flags
& FCV_REALLOC
) {
1088 ASSERT(L
->l_collate_bufsize
> 0);
1089 L
->l_collate_bufsize
*= 2;
1091 safe_realloc(L
->l_collate
.sp
,
1092 L
->l_collate_bufsize
);
1094 __S(stats_incr_convert_reallocs());
1098 * FCV_FAIL has been set.
1105 if (cur_fieldp
->f_options
& FIELD_REVERSE_COMPARISONS
) {
1106 xstrninv(L
->l_collate
.sp
, coll_offset
, distance
);
1107 *(L
->l_collate
.sp
+ coll_offset
+ distance
) =
1108 (char)(UCHAR_MAX
- INTERFIELD_SEPARATOR
);
1112 ASSERT(distance
>= 0);
1113 coll_offset
+= distance
;
1114 if (coll_offset
>= L
->l_collate_bufsize
) {
1115 if (flags
& FCV_REALLOC
) {
1116 ASSERT(L
->l_collate_bufsize
> 0);
1117 L
->l_collate_bufsize
*= 2;
1118 L
->l_collate
.sp
= safe_realloc(L
->l_collate
.sp
,
1119 L
->l_collate_bufsize
);
1121 __S(stats_incr_convert_reallocs());
1126 *(L
->l_collate
.sp
+ coll_offset
) = INTERFIELD_SEPARATOR
;
1129 cur_fieldp
= cur_fieldp
->f_next
;
1132 L
->l_collate_length
= coll_offset
;
1134 return (L
->l_collate_length
);
1138 field_convert_wide(field_t
*F
, line_rec_t
*L
, int flags
,
1139 vchar_t field_separator
)
1141 ssize_t coll_offset
= 0;
1142 ssize_t start
, end
, distance
;
1143 field_t
*cur_fieldp
= F
;
1145 while (cur_fieldp
!= NULL
) {
1146 if (!field_separator
.wc
)
1147 field_delimit_wide(cur_fieldp
, L
, &start
, &end
);
1149 field_delimit_tabbed_wide(cur_fieldp
, L
, &start
, &end
,
1153 if (end
- start
> 0 ||
1154 end
- start
== 0 && F
->f_species
== NUMERIC
) {
1155 distance
= cur_fieldp
->f_convert(cur_fieldp
, L
,
1156 field_separator
, start
, end
- start
, coll_offset
);
1158 if (distance
== -1) {
1159 if (flags
& FCV_REALLOC
) {
1160 ASSERT(L
->l_collate_bufsize
> 0);
1161 L
->l_collate_bufsize
*= 2;
1162 L
->l_collate
.wp
= safe_realloc(
1164 L
->l_collate_bufsize
);
1166 __S(stats_incr_convert_reallocs());
1174 if (cur_fieldp
->f_options
& FIELD_REVERSE_COMPARISONS
) {
1175 xwcsninv(L
->l_collate
.wp
, coll_offset
, distance
);
1176 *(L
->l_collate
.wp
+ coll_offset
+ distance
) =
1177 WCHAR_MAX
- INTERFIELD_SEPARATOR
;
1181 ASSERT(distance
>= 0);
1182 coll_offset
+= distance
;
1183 if (coll_offset
* sizeof (wchar_t) >= L
->l_collate_bufsize
) {
1184 if (flags
& FCV_REALLOC
) {
1185 ASSERT(L
->l_collate_bufsize
> 0);
1186 L
->l_collate_bufsize
*= 2;
1187 L
->l_collate
.wp
= safe_realloc(L
->l_collate
.wp
,
1188 L
->l_collate_bufsize
);
1190 __S(stats_incr_convert_reallocs());
1195 *(L
->l_collate
.wp
+ coll_offset
) = W_INTERFIELD_SEPARATOR
;
1198 cur_fieldp
= cur_fieldp
->f_next
;
1201 L
->l_collate_length
= coll_offset
* sizeof (wchar_t);
1202 #ifdef _LITTLE_ENDIAN
1203 xwcsntomsb(L
->l_collate
.wp
, coll_offset
);
1204 #endif /* _LITTLE_ENDIAN */
1206 return (L
->l_collate_length
);
1210 * line_convert() and line_convert_wide() are called when the collation vector
1211 * of a given line has been exhausted, and we are performing the final,
1212 * full-line comparison required by the sort specification. Because we do not
1213 * have a guarantee that l_data is null-terminated, we create an explicitly
1214 * null-terminated copy suitable for transformation to a collatable form for the
1218 line_convert(line_rec_t
*L
)
1220 static ssize_t bufsize
;
1221 static char *buffer
;
1223 if (L
->l_raw_collate
.sp
!= NULL
)
1226 if (L
->l_data_length
+ 1 > bufsize
) {
1227 buffer
= safe_realloc(buffer
, L
->l_data_length
+ 1);
1228 bufsize
= L
->l_data_length
+ 1;
1231 (void) strncpy(buffer
, L
->l_data
.sp
, L
->l_data_length
);
1232 buffer
[L
->l_data_length
] = '\0';
1234 L
->l_raw_collate
.sp
= safe_realloc(L
->l_raw_collate
.sp
,
1235 xfrm_ops
->sx_len(buffer
, L
->l_data_length
) + 1);
1236 xfrm_ops
->sx_xfrm(L
->l_raw_collate
.sp
, buffer
,
1237 xfrm_ops
->sx_len(buffer
, L
->l_data_length
) + 1);
1239 __S(stats_incr_line_conversions());
1243 line_convert_wide(line_rec_t
*L
)
1245 static wchar_t *buffer
;
1246 static ssize_t bufsize
;
1250 if (L
->l_raw_collate
.wp
!= NULL
)
1253 if (L
->l_data_length
+ 1 > bufsize
) {
1254 buffer
= safe_realloc(buffer
, (L
->l_data_length
+ 1) *
1256 bufsize
= L
->l_data_length
+ 1;
1259 (void) wcsncpy(buffer
, L
->l_data
.wp
, L
->l_data_length
);
1260 buffer
[L
->l_data_length
] = L
'\0';
1262 dlength
= wcsxfrm(NULL
, buffer
, 0) + 1;
1263 L
->l_raw_collate
.wp
= safe_realloc(L
->l_raw_collate
.wp
, dlength
*
1265 (void) wcsxfrm(L
->l_raw_collate
.wp
, buffer
, dlength
);
1267 __S(stats_incr_line_conversions());
1271 * Our convention for collation is
1277 * This convention is consistent with the definition of memcmp(), strcmp(), and
1278 * strncmp() in the C locale. collated() and collated_wide() have two optional
1279 * behaviours, which can be activated by setting the appropriate values in
1280 * coll_flag: COLL_UNIQUE, which returns 0 if the l_collate fields of the line
1281 * records being compared are identical; COLL_DATA_ONLY, which ignores the
1282 * l_collate field for the current comparison; and COLL_REVERSE, which flips the
1283 * result for comparisons that fall through to an actual data comparison (since
1284 * the collated vector should already reflect reverse ordering from field
1288 collated(line_rec_t
*A
, line_rec_t
*B
, ssize_t depth
, flag_t coll_flag
)
1290 ssize_t ml
= MIN(A
->l_collate_length
, B
->l_collate_length
) - depth
;
1292 int mask
= (coll_flag
& COLL_REVERSE
) ? INT_SIGN_FLIP_MASK
:
1296 if (!(coll_flag
& COLL_DATA_ONLY
)) {
1298 r
= memcmp(A
->l_collate
.sp
+ depth
,
1299 B
->l_collate
.sp
+ depth
, ml
);
1305 if (A
->l_collate_length
< B
->l_collate_length
)
1308 if (A
->l_collate_length
> B
->l_collate_length
)
1313 * This is where we cut out, if we know that the current sort is over
1316 if (coll_flag
& COLL_UNIQUE
)
1322 la
= strlen(A
->l_raw_collate
.sp
);
1323 lb
= strlen(B
->l_raw_collate
.sp
);
1325 r
= memcmp(A
->l_raw_collate
.sp
, B
->l_raw_collate
.sp
, MIN(la
, lb
));
1340 collated_wide(line_rec_t
*A
, line_rec_t
*B
, ssize_t depth
, flag_t coll_flag
)
1342 ssize_t ml
= MIN(A
->l_collate_length
, B
->l_collate_length
) - depth
;
1344 int mask
= (coll_flag
& COLL_REVERSE
) ? INT_SIGN_FLIP_MASK
:
1348 if (!(coll_flag
& COLL_DATA_ONLY
)) {
1350 r
= memcmp(A
->l_collate
.sp
+ depth
,
1351 B
->l_collate
.sp
+ depth
, ml
);
1356 if (A
->l_collate_length
< B
->l_collate_length
)
1359 if (A
->l_collate_length
> B
->l_collate_length
)
1363 if (coll_flag
& COLL_UNIQUE
)
1366 line_convert_wide(A
);
1367 line_convert_wide(B
);
1369 la
= wcslen(A
->l_raw_collate
.wp
);
1370 lb
= wcslen(B
->l_raw_collate
.wp
);
1372 r
= wmemcmp(A
->l_raw_collate
.wp
, B
->l_raw_collate
.wp
,
1373 (size_t)MIN(la
, lb
));