usr/src/cmd/spell/huff.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License, Version 1.0 only
   6  * (the "License").  You may not use this file except in compliance
   7  * with the License.
   8  *
   9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  10  * or http://www.opensolaris.org/os/licensing.
  11  * See the License for the specific language governing permissions
  12  * and limitations under the License.
  13  *
  14  * When distributing Covered Code, include this CDDL HEADER in each
  15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  16  * If applicable, add the following below this CDDL HEADER, with the
  17  * fields enclosed by brackets "[]" replaced with your own identifying
  18  * information: Portions Copyright [yyyy] [name of copyright owner]
  19  *
  20  * CDDL HEADER END
  21  */
  22 /*
  23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
  24  * Use is subject to license terms.
  25  */
  26
  27 /*      Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
  28 /*        All Rights Reserved   */
  29
  30 #pragma ident   "%Z%%M% %I%     %E% SMI"
  31
  32
  33 #include <unistd.h>
  34 #include <stdlib.h>
  35 #include <stdio.h>
  36
  37 #define BYTE 8
  38 #define QW 1            /* width of bas-q digit in bits */
  39
  40 /*
  41  * this stuff should be local and hidden; it was made
  42  * accessible outside for dirty reasons: 20% faster spell
  43  */
  44 #include "huff.h"
  45 struct huff huffcode;
  46
  47 /*
  48  * Infinite Huffman code
  49  *
  50  * Let the messages be exponentially distributed with ratio r:
  51  *      P {message k} = r^k*(1-r),      k = 0, 1, ...
  52  * Let the messages be coded in base q, and suppose
  53  *      r^n = 1/q
  54  * If each decade(base q) contains n codes, then
  55  * the messages assigned to each decade will be q times
  56  * as probable as the next. Moreover the code for the tail of
  57  * the distribution after truncating one decade should look
  58  * just like the original, but longer by one leading digit q-1.
  59  *      q(z+n) = z + (q-1)q^w
  60  * where z is first code of decade, w is width of code, in shortest
  61  * full decade. Examples, base 2:
  62  *      r^1 = 1/2       r^5 = 1/2
  63  *      0               0110
  64  *      10              0111
  65  *      110             1000
  66  *      1110            1001
  67  *      ...             1010
  68  *                      10110
  69  *      w = 1, z = 0            w = 4, z = 0110
  70  * Rewriting slightly
  71  *      (q-1)z + q*n = (q-1)q^w
  72  * whence z is a multiple of q and n is a multiple of q-1. Let
  73  *      z = cq, n = d(q-1)
  74  * We pick w to be the least integer such that
  75  *      d = n/(q-1) <= q^(w-1)
  76  * Then solve for c
  77  *      c = q^(w-1) - d
  78  * If c is not zero, the first decade may be preceded by
  79  * even shorter (w-1)-digit codes 0, 1, ..., c-1. Thus
  80  * the example code with r^5 = 1/2 becomes
  81  *      000
  82  *      001
  83  *      010
  84  *      0110
  85  *      0111
  86  *      1000
  87  *      1001
  88  *      1010
  89  *      10110
  90  *      ...
  91  *      110110
  92  *      ...
  93  * The expected number of base-q digits in a codeword is then
  94  *      w - 1 + r^c/(1-r^n)
  95  * The present routines require q to be a power of 2
  96  */
  97 /*
  98  * There is a lot of hanky-panky with left justification against
  99  * sign instead of simple left justification because
 100  * unsigned long is not available
 101  */
 102 #define L (BYTE*(sizeof (long))-1)      /* length of signless long */
 103 #define MASK (~((unsigned long)1<<L))   /* mask out sign */
 104
 105 /*
 106  * decode the prefix of word y (which is left justified against sign)
 107  * place mesage number into place pointed to by kp
 108  * return length (in bits) of decoded prefix or 0 if code is out of
 109  * range
 110  */
 111 int
 112 decode(long y, long *pk)
 113 {
 114         int l;
 115         long v;
 116         if (y < cs) {
 117                 *pk = y >> (long)(L+QW-w);
 118                 return (w-QW);
 119         }
 120         for (l = w, v = v0; y >= qcs;
 121             y = ((unsigned long)y << QW) & MASK, v += n)
 122                 if ((l += QW) > L)
 123                         return (0);
 124         *pk = v + (y>>(long)(L-w));
 125         return (l);
 126 }
 127
 128 /*
 129  * encode message k and put result (right justified) into
 130  * place pointed to by py.
 131  * return length (in bits) of result,
 132  * or 0 if code is too long
 133  */
 134
 135 int
 136 encode(long k, long *py)
 137 {
 138         int l;
 139         long y;
 140         if (k < c) {
 141                 *py = k;
 142                 return (w-QW);
 143         }
 144         for (k -= c, y = 1, l = w; k >= n; k -= n, y <<= QW)
 145                 if ((l += QW) > L)
 146                         return (0);
 147         *py = ((y-1)<<w) + cq + k;
 148         return (l);
 149 }
 150
 151
 152 /*
 153  * Initialization code, given expected value of k
 154  * E(k) = r/(1-r) = a
 155  * and given base width b
 156  * return expected length of coded messages
 157  */
 158 static struct qlog {
 159         long p;
 160         double u;
 161 } z;
 162
 163 static struct qlog
 164 qlog(double x, double y, long p, double u)      /* find smallest p so x^p<=y */
 165 {
 166
 167         if (u/x <= y) {
 168                 z.p = 0;
 169                 z.u = 1;
 170         } else {
 171                 z = qlog(x, y, p+p, u*u);
 172                 if (u*z.u/x > y) {
 173                         z.p += p;
 174                         z.u *= u;
 175                 }
 176         }
 177         return (z);
 178 }
 179
 180 double
 181 huff(float a)
 182 {
 183         int i, q;
 184         long d, j;
 185         double r = a/(1.0 + a);
 186         double rc, rq;
 187
 188         for (i = 0, q = 1, rq = r; i < QW; i++, q *= 2, rq *= rq)
 189                 continue;
 190         rq /= r;        /* rq = r^(q-1) */
 191         (void) qlog(rq, 1./q, 1L, rq);
 192         d = z.p;
 193         n = d*(q-1);
 194         if (n != d * (q - 1))
 195                 abort();        /* time to make n long */
 196         for (w = QW, j = 1; j < d; w += QW, j *= q)
 197                 continue;
 198         c = j - d;
 199         cq = c*q;
 200         cs = cq<<(L-w);
 201         qcs = (((long)(q-1)<<w) + cq) << (L-QW-w);
 202         v0 = c - cq;
 203         for (i = 0, rc = 1; i < c; i++, rc *= r)        /* rc = r^c */
 204                 continue;
 205         return (w + QW*(rc/(1-z.u) - 1));
 206 }
 207
 208 void
 209 whuff(void)
 210 {
 211         (void) fwrite((char *) & huffcode, sizeof (huffcode), 1, stdout);
 212 }
 213
 214 int
 215 rhuff(FILE *f)
 216 {
 217         return (read(fileno(f), (char *)&huffcode, sizeof (huffcode)) ==
 218             sizeof (huffcode));
 219 }