/** @file
 * @brief generate a URL term, truncating and hashing very long URLs.
 */
/* Copyright (C) 2003 Lemur Consulting Ltd.
 * Copyright (C) 2003,2004,2006,2011,2023 Olly Betts
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
 * USA
 */
/* The hash is a 32-bit value which is converted to a string by writing
 * 6 bits of it to each output byte.  So the length is ceil(4 * 8 / 6)
 * (we use 4 rather than the size of a native integer type so that the
 * hash is the same regardless of the platform).
 */
const unsigned int HASH_LEN = (4 * 8 + 5) / 6;

/* Make a hash of a string - this isn't a very good hashing algorithm, but
 * it's fast.  A collision would result in a document overwriting a different
 * document, which is not desirable, but also wouldn't be a total disaster.
 *
 * @param s  The bytes to hash.
 *
 * @return  A string of exactly HASH_LEN bytes encoding the 32-bit hash,
 *          least significant 6-bit group first.
 */
static std::string
hash_string(const std::string &s)
{
    // Each step computes h * 33 + ch, reduced modulo 2^32.  With h below
    // 2^32 the intermediate value is below 2^38, so it always fits in
    // unsigned long long (at least 64 bits) and nothing ever wraps.
    unsigned long long h = 1;
    for (unsigned char ch : s) {
        h = (h * 33 + ch) & 0xffffffff;
    }

    // FIXME: It's quirky that we make leading zeros ' ' here, but "embedded"
    // zeros become char(33) below.  Not a problem, but perhaps change ' ' to
    // char(33) if we need to break backwards compatibility for some other
    // reason.
    std::string result(HASH_LEN, ' ');
    // A 32-bit value has at most HASH_LEN non-zero 6-bit groups, so the
    // index can't run off the end of result.
    for (std::string::size_type i = 0; h != 0; ++i, h >>= 6) {
        result[i] = char((h & 63) + 33);
    }
    return result;
}

/* Limit a term to max_length bytes.
 *
 * A term which already fits is returned unchanged.  Otherwise the first
 * (max_length - HASH_LEN) bytes are kept and everything after them is
 * replaced by a HASH_LEN byte hash of the removed tail, so the result is
 * exactly max_length bytes long.
 *
 * @param term        The term to limit.
 * @param max_length  Maximum length of the result in bytes; must be at
 *                    least HASH_LEN (checked with assert()).
 *
 * @return  The term, truncated and hashed if it was too long.
 */
std::string
hash_long_term(const std::string &term, unsigned int max_length)
{
    assert(max_length >= HASH_LEN);
    if (term.length() <= max_length) return term;

    // Number of leading bytes of the term which survive unchanged.
    max_length -= HASH_LEN;
    std::string result(term, 0, max_length);
    result += hash_string(term.substr(max_length));
    return result;
}