Fix tg_termpos1 for 64-bit termpos
[xapian.git] / xapian-applications / omega / hashterm.cc
blob5210d0bf483c8bf077249bddc0e6139b3ed0289b
1 /** @file
2 * @brief generate a URL term, truncating and hashing very long URLs.
3 */
4 /* Copyright (C) 2003 Lemur Consulting Ltd.
5 * Copyright (C) 2003,2004,2006,2011,2023 Olly Betts
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License as
9 * published by the Free Software Foundation; either version 2 of the
10 * License, or (at your option) any later version.
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
20 * USA
23 #include <config.h>
24 #include "hashterm.h"
26 #include <cassert>
28 using namespace std;
/* Length in bytes of the string form of a hash.
 *
 * The hash is computed as an unsigned long and then written out 6 bits per
 * output byte.  We size this for 4 bytes of hash rather than for
 * sizeof(unsigned long) so that the hash is the same regardless of the
 * platform, which gives ceil(4 * 8 / 6) output bytes.
 */
const unsigned int HASH_LEN = (4 * 8 + (6 - 1)) / 6;
37 /* Make a hash of a string - this isn't a very good hashing algorithm, but
38 * it's fast. A collision would result in a document overwriting a different
39 * document, which is not desirable, but also wouldn't be a total disaster.
41 static string
42 hash_string(const string &s)
44 unsigned long int h = 1;
45 for (unsigned char ch : s) {
46 UNSIGNED_OVERFLOW_OK(h += (h << 5) + ch);
48 h &= 0xffffffff; // In case sizeof(unsigned long) > 4
49 // FIXME: It's quirky that we make leading zeros ' ' here, but "embedded"
50 // zeros become char(33) below. Not a problem, but perhaps change ' ' to
51 // char(33) if we need to break backwards compatibility for some other
52 // reason.
53 string result(HASH_LEN, ' ');
54 size_t j = 0;
55 while (h != 0) {
56 char ch = char((h & 63) + 33);
57 result[j++] = ch;
58 h = h >> 6;
60 return result;
63 string
64 hash_long_term(const string &term, unsigned int max_length)
66 assert(max_length >= HASH_LEN);
67 if (term.length() <= max_length) return term;
68 string result(term);
69 max_length -= HASH_LEN;
70 result.replace(max_length, string::npos,
71 hash_string(result.substr(max_length)));
72 return result;