usr.bin/sort/fields.c

   1 /*      $NetBSD: fields.c,v 1.30 2009/10/07 21:02:57 dsl Exp $  */
   2
   3 /*-
   4  * Copyright (c) 2000-2003 The NetBSD Foundation, Inc.
   5  * All rights reserved.
   6  *
   7  * This code is derived from software contributed to The NetBSD Foundation
   8  * by Ben Harris and Jaromir Dolecek.
   9  *
  10  * Redistribution and use in source and binary forms, with or without
  11  * modification, are permitted provided that the following conditions
  12  * are met:
  13  * 1. Redistributions of source code must retain the above copyright
  14  *    notice, this list of conditions and the following disclaimer.
  15  * 2. Redistributions in binary form must reproduce the above copyright
  16  *    notice, this list of conditions and the following disclaimer in the
  17  *    documentation and/or other materials provided with the distribution.
  18  *
  19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
  20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
  21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
  23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  29  * POSSIBILITY OF SUCH DAMAGE.
  30  */
  31
  32 /*-
  33  * Copyright (c) 1993
  34  *      The Regents of the University of California.  All rights reserved.
  35  *
  36  * This code is derived from software contributed to Berkeley by
  37  * Peter McIlroy.
  38  *
  39  * Redistribution and use in source and binary forms, with or without
  40  * modification, are permitted provided that the following conditions
  41  * are met:
  42  * 1. Redistributions of source code must retain the above copyright
  43  *    notice, this list of conditions and the following disclaimer.
  44  * 2. Redistributions in binary form must reproduce the above copyright
  45  *    notice, this list of conditions and the following disclaimer in the
  46  *    documentation and/or other materials provided with the distribution.
  47  * 3. Neither the name of the University nor the names of its contributors
  48  *    may be used to endorse or promote products derived from this software
  49  *    without specific prior written permission.
  50  *
  51  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  52  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  53  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  54  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  55  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  56  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  57  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  58  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  59  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  60  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  61  * SUCH DAMAGE.
  62  */
  63
  64 /* Subroutines to generate sort keys. */
  65
  66 #include "sort.h"
  67
  68 __RCSID("$NetBSD: fields.c,v 1.30 2009/10/07 21:02:57 dsl Exp $");
  69
  70 #define SKIP_BLANKS(ptr) {                                      \
  71         if (BLANK & d_mask[*(ptr)])                             \
  72                 while (BLANK & d_mask[*(++(ptr))]);             \
  73 }
  74
  75 #define NEXTCOL(pos) {                                          \
  76         if (!SEP_FLAG)                                          \
  77                 while (BLANK & l_d_mask[*(++pos)]);             \
  78         while ((*(pos+1) != '\0') && !((FLD_D | REC_D_F) & l_d_mask[*++pos]));\
  79 }
  80
  81 static u_char *enterfield(u_char *, const u_char *, struct field *, int);
  82 static u_char *number(u_char *, const u_char *, u_char *, u_char *, int);
  83
  84 #define DECIMAL_POINT '.'
  85
  86 /*
  87  * constructs sort key with leading recheader, followed by the key,
  88  * followed by the original line.
  89  */
  90 length_t
  91 enterkey(RECHEADER *keybuf, const u_char *keybuf_end, u_char *line_data,
  92     size_t line_size, struct field fieldtable[])
  93         /* keybuf:       pointer to start of key */
  94 {
  95         int i;
  96         u_char *l_d_mask;
  97         u_char *lineend, *pos;
  98         const u_char *endkey;
  99         u_char *keypos;
 100         struct coldesc *clpos;
 101         int col = 1;
 102         struct field *ftpos;
 103
 104         l_d_mask = d_mask;
 105         pos = line_data - 1;
 106         lineend = line_data + line_size-1;
 107                                 /* don't include rec_delimiter */
 108
 109         for (i = 0; i < ncols; i++) {
 110                 clpos = clist + i;
 111                 for (; (col < clpos->num) && (pos < lineend); col++) {
 112                         NEXTCOL(pos);
 113                 }
 114                 if (pos >= lineend)
 115                         break;
 116                 clpos->start = SEP_FLAG ? pos + 1 : pos;
 117                 NEXTCOL(pos);
 118                 clpos->end = pos;
 119                 col++;
 120                 if (pos >= lineend) {
 121                         clpos->end = lineend;
 122                         i++;
 123                         break;
 124                 }
 125         }
 126         for (; i <= ncols; i++)
 127                 clist[i].start = clist[i].end = lineend;
 128         if (clist[0].start < line_data)
 129                 clist[0].start++;
 130
 131         /*
 132          * We write the sort keys (concatenated) followed by the
 133          * original line data (for output) as the 'keybuf' data.
 134          * keybuf->length is the number of key bytes + data bytes.
 135          * keybuf->offset is the number of key bytes.
 136          * We add a record separator weight after the key in case
 137          * (as is usual) we need to preserve the order of equal lines,
 138          * and for 'sort -u'.
 139          * The key itself will have had the correct weight applied.
 140          */
 141         keypos = keybuf->data;
 142         endkey = keybuf_end - line_size - 1;
 143         if (endkey <= keypos)
 144                 /* No room for any key bytes */
 145                 return 1;
 146
 147         for (ftpos = fieldtable + 1; ftpos->icol.num; ftpos++) {
 148                 if ((keypos = enterfield(keypos, endkey, ftpos,
 149                     fieldtable->flags)) == NULL)
 150                         return (1);
 151         }
 152
 153         keybuf->offset = keypos - keybuf->data;
 154         keybuf->length = keybuf->offset + line_size;
 155
 156         /*
 157          * Posix requires that equal keys be further sorted by the
 158          * entire original record.
 159          * NetBSD has (at least for some time) kept equal keys in
 160          * their original order.
 161          * For 'sort -u' posix_sort is unset.
 162          */
 163         keybuf->keylen = posix_sort ? keybuf->length : keybuf->offset;
 164
 165         memcpy(keypos, line_data, line_size);
 166         return (0);
 167 }
 168
 169 /*
 170  * constructs a field (as defined by -k) within a key
 171  */
 172 static u_char *
 173 enterfield(u_char *tablepos, const u_char *endkey, struct field *cur_fld,
 174     int gflags)
 175 {
 176         u_char *start, *end, *lineend, *mask, *lweight;
 177         struct column icol, tcol;
 178         u_int flags;
 179
 180         icol = cur_fld->icol;
 181         tcol = cur_fld->tcol;
 182         flags = cur_fld->flags;
 183         start = icol.p->start;
 184         lineend = clist[ncols].end;
 185         if (flags & BI)
 186                 SKIP_BLANKS(start);
 187         start += icol.indent;
 188         start = min(start, lineend);
 189
 190         if (!tcol.num)
 191                 end = lineend;
 192         else {
 193                 if (tcol.indent) {
 194                         end = tcol.p->start;
 195                         if (flags & BT)
 196                                 SKIP_BLANKS(end);
 197                         end += tcol.indent;
 198                         end = min(end, lineend);
 199                 } else
 200                         end = tcol.p->end;
 201         }
 202
 203         if (flags & N)
 204                 return number(tablepos, endkey, start, end, flags);
 205
 206         /* Bound check space - assuming nothing is skipped */
 207         if (tablepos + (end - start) + 1 >= endkey)
 208                 return NULL;
 209
 210         mask = cur_fld->mask;
 211         lweight = cur_fld->weights;
 212         for (; start < end; start++) {
 213                 if (!mask || mask[*start]) {
 214                         *tablepos++ = lweight[*start];
 215                 }
 216         }
 217         /* Add extra byte (absent from lweight) to sort short keys correctly */
 218         *tablepos++ = lweight[REC_D];
 219         return tablepos;
 220 }
 221
 222 /*
 223  * Numbers are converted to a floating point format (exponent & mantissa)
 224  * so that they compare correctly as sequence of unsigned bytes.
 225  * Bytes 0x00 and 0xff are used to terminate positive and negative numbers
 226  * to ensure that 0.123 sorts after 0.12 and -0.123 sorts before -0.12.
 227  *
 228  * The first byte contain the overall sign, exponent sign and some of the
 229  * exponent. These have to be ordered (-ve value, decreasing exponent),
 230  * zero, (+ve value, increasing exponent).
 231  *
 232  * The first byte is 0x80 for zero, 0xc0 for +ve with exponent 0.
 233  * -ve values are the 1's compliments (so 0x7f isn't used!).
 234  *
 235  * This only leaves 63 byte values for +ve exponents - which isn't enough.
 236  * The largest 4 exponent values are used to hold a byte count of the
 237  * number of following bytes that contain 8 exponent bits per byte,
 238  * This lets us sort exponents from -2^31 to +2^31.
 239  *
 240  * The mantissa is stored 2 digits per byte offset by 0x40, for negative
 241  * numbers the order must be reversed (they are bit inverted).
 242  *
 243  * Reverse sorts are done by inverting the sign of the number.
 244  */
 245 #define MAX_EXP_ENC  ((int)sizeof(int))
 246
 247 static u_char *
 248 number(u_char *pos, const u_char *bufend, u_char *line, u_char *lineend,
 249     int reverse)
 250 {
 251         int exponent = -1;
 252         int had_dp = 0;
 253         u_char *tline;
 254         char ch;
 255         unsigned int val;
 256         u_char *last_nz_pos;
 257         u_char negate;
 258
 259         if (reverse & R)
 260                 negate = 0xff;
 261         else
 262                 negate = 0;
 263
 264         /* Give ourselves space for the key terminator */
 265         bufend--;
 266
 267         /* Ensure we have enough space for the exponent */
 268         if (pos + 1 + MAX_EXP_ENC > bufend)
 269                 return (NULL);
 270
 271         SKIP_BLANKS(line);
 272         if (*line == '-') {     /* set the sign */
 273                 negate ^= 0xff;
 274                 line++;
 275         }
 276         /* eat initial zeroes */
 277         for (; *line == '0' && line < lineend; line++)
 278                 continue;
 279
 280         /* calculate exponents */
 281         if (*line == DECIMAL_POINT) {
 282                 /* Decimal fraction */
 283                 had_dp = 1;
 284                 while (*++line == '0' && line < lineend)
 285                         exponent--;
 286         } else {
 287                 /* Large (absolute) value, count digits */
 288                 for (tline = line; *tline >= '0' &&
 289                     *tline <= '9' && tline < lineend; tline++)
 290                         exponent++;
 291         }
 292
 293         /* If the first/next character isn't a digit, value is zero */
 294         if (*line < '1' || *line > '9' || line >= lineend) {
 295                 /* This may be "0", "0.00", "000" or "fubar" but sorts as 0 */
 296                 /* XXX what about NaN, NAN, inf and INF */
 297                 *pos++ = 0x80;
 298                 return pos;
 299         }
 300
 301         /* Maybe here we should allow for e+12 (etc) */
 302
 303         if (exponent < 0x40 - MAX_EXP_ENC && -exponent < 0x40 - MAX_EXP_ENC) {
 304                 /* Value ok for simple encoding */
 305                 /* exponent 0 is 0xc0 for +ve numbers and 0x40 for -ve ones */
 306                 exponent += 0xc0;
 307                 *pos++ = negate ^ exponent;
 308         } else {
 309                 /* Out or range for a single byte */
 310                 int c, t;
 311                 t = exponent > 0 ? exponent : -exponent;
 312                 /* Count how many 8-bit bytes are needed */
 313                 for (c = 0; ; c++) {
 314                         t >>= 8;
 315                         if (t == 0)
 316                                 break;
 317                 }
 318                 /* 'c' better be 0..3 here - but probably 0..1 */
 319                 /* Offset just outside valid range */
 320                 t = c + 0x40 - MAX_EXP_ENC;
 321                 if (exponent < 0)
 322                         t = -t;
 323                 *pos++ = negate ^ (t + 0xc0);
 324                 /* now add each byte, most significant first */
 325                 for (; c >= 0; c--)
 326                         *pos++ = negate ^ (exponent >> (c * 8));
 327         }
 328
 329         /* Finally add mantissa, 2 digits per byte */
 330         for (last_nz_pos = pos; line < lineend; ) {
 331                 if (pos >= bufend)
 332                         return NULL;
 333                 ch = *line++;
 334                 val = (ch - '0') * 10;
 335                 if (val > 90) {
 336                         if (ch == DECIMAL_POINT && !had_dp) {
 337                                 had_dp = 1;
 338                                 continue;
 339                         }
 340                         break;
 341                 }
 342                 while (line < lineend) {
 343                         ch = *line++;
 344                         if (ch == DECIMAL_POINT && !had_dp) {
 345                                 had_dp = 1;
 346                                 continue;
 347                         }
 348                         if (ch < '0' || ch > '9')
 349                                 line = lineend;
 350                         else
 351                                 val += ch - '0';
 352                         break;
 353                 }
 354                 *pos++ = negate ^ (val + 0x40);
 355                 if (val != 0)
 356                         last_nz_pos = pos;
 357         }
 358
 359         /* Add key terminator, deleting any trailing "00" */
 360         *last_nz_pos++ = negate;
 361
 362         return (last_nz_pos);
 363 }