xapian-core/api/sortable-serialise.cc

   1 /** @file
   2  * @brief Serialise floating point values to string which sort the same way.
   3  */
   4 /* Copyright (C) 2007,2009,2015,2016 Olly Betts
   5  *
   6  * This program is free software; you can redistribute it and/or modify
   7  * it under the terms of the GNU General Public License as published by
   8  * the Free Software Foundation; either version 2 of the License, or
   9  * (at your option) any later version.
  10  *
  11  * This program is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14  * GNU General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU General Public License
  17  * along with this program; if not, write to the Free Software
  18  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
  19  */
  20
  21 #include <config.h>
  22
  23 #include <xapian/queryparser.h>
  24
  25 // Disable these assertions when building the library as these functions are
  26 // marked not to throw exceptions in the API headers.  In unittest we define
  27 // UNITTEST_ASSERT_NOTHROW to set a variable to the exception message and
  28 // return, then the harness checks if that variable has been set.
  29 #ifndef XAPIAN_UNITTEST
  30 # define UNITTEST_ASSERT_NOTHROW(COND,RET)
  31 #endif
  32
  33 #include <cfloat>
  34 #include <cmath>
  35 #include <cstring>
  36
  37 #include <string>
  38
  39 using namespace std;
  40
  41 #if FLT_RADIX != 2
  42 # error Code currently assumes FLT_RADIX == 2
  43 #endif
  44
  45 #ifdef _MSC_VER
  46 // Disable warning about negating an unsigned type, which we do deliberately.
  47 # pragma warning(disable:4146)
  48 #endif
  49
  50 size_t
  51 Xapian::sortable_serialise_(double value, char * buf) XAPIAN_NOEXCEPT
  52 {
  53     double mantissa;
  54     int exponent;
  55
  56     // Negative infinity.
  57     if (value < -DBL_MAX) return 0;
  58
  59     mantissa = frexp(value, &exponent);
  60
  61     /* Deal with zero specially.
  62      *
  63      * IEEE representation of doubles uses 11 bits for the exponent, with a
  64      * bias of 1023.  We bias this by subtracting 8, and non-IEEE
  65      * representations may allow higher exponents, so allow exponents down to
  66      * -2039 - if smaller exponents are possible anywhere, we underflow such
  67      *  numbers to 0.
  68      */
  69     if (mantissa == 0.0 || exponent < -2039) {
  70         *buf = '\x80';
  71         return 1;
  72     }
  73
  74     bool negative = (mantissa < 0);
  75     if (negative) mantissa = -mantissa;
  76
  77     // Infinity, or extremely large non-IEEE representation.
  78     if (value > DBL_MAX || exponent > 2055) {
  79         if (negative) {
  80             // This can only happen with a non-IEEE representation, because
  81             // we've already tested for value < -DBL_MAX
  82             return 0;
  83         } else {
  84             memset(buf, '\xff', 9);
  85             return 9;
  86         }
  87     }
  88
  89     // Encoding:
  90     //
  91     // [ 7 | 6 | 5 | 4 3 2 1 0]
  92     //   Sm  Se  Le
  93     //
  94     // Sm stores the sign of the mantissa: 1 = positive or zero, 0 = negative.
  95     // Se stores the sign of the exponent: Sm for positive/zero, !Sm for neg.
  96     // Le stores the length of the exponent: !Se for 3 bits, Se for 11 bits.
  97     unsigned char next = (negative ? 0 : 0xe0);
  98
  99     // Bias the exponent by 8 so that more small integers get short encodings.
 100     exponent -= 8;
 101     bool exponent_negative = (exponent < 0);
 102     if (exponent_negative) {
 103         exponent = -exponent;
 104         next ^= 0x60;
 105     }
 106
 107     size_t len = 0;
 108
 109     /* We store the exponent in 3 or 11 bits.  If the number is negative, we
 110      * flip all the bits of the exponent, since larger negative numbers should
 111      * sort first.
 112      *
 113      * If the exponent is negative, we flip the bits of the exponent, since
 114      * larger negative exponents should sort first (unless the number is
 115      * negative, in which case they should sort later).
 116      */
 117     UNITTEST_ASSERT_NOTHROW(exponent >= 0, 0);
 118     if (exponent < 8) {
 119         next ^= 0x20;
 120         next |= static_cast<unsigned char>(exponent << 2);
 121         if (negative ^ exponent_negative) next ^= 0x1c;
 122     } else {
 123         UNITTEST_ASSERT_NOTHROW((exponent >> 11) == 0, 0);
 124         // Put the top 5 bits of the exponent into the lower 5 bits of the
 125         // first byte:
 126         next |= static_cast<unsigned char>(exponent >> 6);
 127         if (negative ^ exponent_negative) next ^= 0x1f;
 128         buf[len++] = next;
 129         // And the lower 6 bits of the exponent go into the upper 6 bits
 130         // of the second byte:
 131         next = static_cast<unsigned char>(exponent << 2);
 132         if (negative ^ exponent_negative) next ^= 0xfc;
 133     }
 134
 135     // Convert the 52 (or 53) bits of the mantissa into two 32-bit words.
 136     mantissa *= 1 << (negative ? 26 : 27);
 137     unsigned word1 = static_cast<unsigned>(mantissa);
 138     mantissa -= word1;
 139     unsigned word2 = static_cast<unsigned>(mantissa * 4294967296.0); // 1<<32
 140     // If the number is positive, the first bit will always be set because 0.5
 141     // <= mantissa < 1, unless mantissa is zero, which we handle specially
 142     // above).  If the number is negative, we negate the mantissa instead of
 143     // flipping all the bits, so in the case of 0.5, the first bit isn't set
 144     // so we need to store it explicitly.  But for the cost of one extra
 145     // leading bit, we can save several trailing 0xff bytes in lots of common
 146     // cases.
 147     UNITTEST_ASSERT_NOTHROW(negative || (word1 & (1<<26)), 0);
 148     if (negative) {
 149         // We negate the mantissa for negative numbers, so that the sort order
 150         // is reversed (since larger negative numbers should come first).
 151         word1 = -word1;
 152         if (word2 != 0) ++word1;
 153         word2 = -word2;
 154     }
 155
 156     word1 &= 0x03ffffff;
 157     next |= static_cast<unsigned char>(word1 >> 24);
 158     buf[len++] = next;
 159     buf[len++] = char(word1 >> 16);
 160     buf[len++] = char(word1 >> 8);
 161     buf[len++] = char(word1);
 162
 163     buf[len++] = char(word2 >> 24);
 164     buf[len++] = char(word2 >> 16);
 165     buf[len++] = char(word2 >> 8);
 166     buf[len++] = char(word2);
 167
 168     // Finally, we can chop off any trailing zero bytes.
 169     while (len > 0 && buf[len - 1] == '\0') {
 170         --len;
 171     }
 172
 173     return len;
 174 }
 175
 176 /// Get a number from the character at a given position in a string, returning
 177 /// 0 if the string isn't long enough.
 178 static inline unsigned char
 179 numfromstr(const std::string & str, std::string::size_type pos)
 180 {
 181     return (pos < str.size()) ? static_cast<unsigned char>(str[pos]) : '\0';
 182 }
 183
 184 double
 185 Xapian::sortable_unserialise(const std::string & value) XAPIAN_NOEXCEPT
 186 {
 187     // Zero.
 188     if (value.size() == 1 && value[0] == '\x80') return 0.0;
 189
 190     // Positive infinity.
 191     if (value.size() == 9 &&
 192         memcmp(value.data(), "\xff\xff\xff\xff\xff\xff\xff\xff\xff", 9) == 0) {
 193 #ifdef INFINITY
 194         // INFINITY is C99.  Oddly, it's of type "float" so sanity check in
 195         // case it doesn't cast to double as infinity (apparently some
 196         // implementations have this problem).
 197         if (double(INFINITY) > HUGE_VAL) return INFINITY;
 198 #endif
 199         return HUGE_VAL;
 200     }
 201
 202     // Negative infinity.
 203     if (value.empty()) {
 204 #ifdef INFINITY
 205         if (double(INFINITY) > HUGE_VAL) return -INFINITY;
 206 #endif
 207         return -HUGE_VAL;
 208     }
 209
 210     unsigned char first = numfromstr(value, 0);
 211     size_t i = 0;
 212
 213     first ^= static_cast<unsigned char>(first & 0xc0) >> 1;
 214     bool negative = !(first & 0x80);
 215     bool exponent_negative = (first & 0x40);
 216     bool explen = !(first & 0x20);
 217     int exponent = first & 0x1f;
 218     if (!explen) {
 219         exponent >>= 2;
 220         if (negative ^ exponent_negative) exponent ^= 0x07;
 221     } else {
 222         first = numfromstr(value, ++i);
 223         exponent <<= 6;
 224         exponent |= (first >> 2);
 225         if (negative ^ exponent_negative) exponent ^= 0x07ff;
 226     }
 227
 228     unsigned word1;
 229     word1 = (unsigned(first & 0x03) << 24);
 230     word1 |= numfromstr(value, ++i) << 16;
 231     word1 |= numfromstr(value, ++i) << 8;
 232     word1 |= numfromstr(value, ++i);
 233
 234     unsigned word2 = 0;
 235     if (i < value.size()) {
 236         word2 = numfromstr(value, ++i) << 24;
 237         word2 |= numfromstr(value, ++i) << 16;
 238         word2 |= numfromstr(value, ++i) << 8;
 239         word2 |= numfromstr(value, ++i);
 240     }
 241
 242     if (negative) {
 243         word1 = -word1;
 244         if (word2 != 0) ++word1;
 245         word2 = -word2;
 246         UNITTEST_ASSERT_NOTHROW((word1 & 0xf0000000) != 0, 0);
 247         word1 &= 0x03ffffff;
 248     }
 249     if (!negative) word1 |= 1<<26;
 250
 251     double mantissa = 0;
 252     if (word2) mantissa = word2 / 4294967296.0; // 1<<32
 253     mantissa += word1;
 254     mantissa /= 1 << (negative ? 26 : 27);
 255
 256     if (exponent_negative) exponent = -exponent;
 257     exponent += 8;
 258
 259     if (negative) mantissa = -mantissa;
 260
 261     // We use scalbn() since it's equivalent to ldexp() when FLT_RADIX == 2
 262     // (which we currently assume), except that ldexp() will set errno if the
 263     // result overflows or underflows, which isn't really desirable here.
 264     return scalbn(mantissa, exponent);
 265 }