Fix integer type used by ESet
[xapian.git] / xapian-core / tests / api_spelling.cc
blob80f8c43bf3af11199ffe5d0c02f89ab379de92cf
/** @file
 * @brief Test the spelling correction suggestion API.
 */
/* Copyright (C) 2007-2021 Olly Betts
 * Copyright (C) 2007 Lemur Consulting Ltd
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 */
22 #include <config.h>
24 #include "api_spelling.h"
26 #include <xapian.h>
28 #include "apitest.h"
29 #include "testsuite.h"
30 #include "testutils.h"
32 #include <string>
34 using namespace std;
36 // Test add_spelling() and remove_spelling(), which remote dbs support.
37 DEFINE_TESTCASE(spell0, spelling || remote) {
38 Xapian::WritableDatabase db = get_writable_database();
40 db.add_spelling("hello");
41 db.add_spelling("cell", 2);
42 db.commit();
43 db.add_spelling("zig");
44 db.add_spelling("ch");
45 db.add_spelling("hello", 2);
46 db.remove_spelling("hello", 2);
47 db.remove_spelling("cell", 6);
48 db.commit();
49 db.remove_spelling("hello");
50 db.remove_spelling("nonsuch");
51 db.remove_spelling("zzzzzzzzz", 1000000);
52 db.remove_spelling("aarvark");
53 db.remove_spelling("hello");
54 db.commit();
55 db.remove_spelling("hello");
58 // Test basic spelling correction features.
59 DEFINE_TESTCASE(spell1, spelling) {
60 Xapian::WritableDatabase db = get_writable_database();
62 // Check that the more frequent term is chosen.
63 db.add_spelling("hello");
64 TEST_EQUAL(db.get_spelling_suggestion("cell"), "hello");
65 db.add_spelling("cell", 2);
66 TEST_EQUAL(db.get_spelling_suggestion("hell"), "cell");
67 db.commit();
68 Xapian::Database dbr(get_writable_database_as_database());
69 TEST_EQUAL(db.get_spelling_suggestion("hell"), "cell");
70 TEST_EQUAL(dbr.get_spelling_suggestion("hell"), "cell");
72 // Check suggestions for single edit errors to "zig".
73 db.add_spelling("zig");
74 // Transpositions:
75 TEST_EQUAL(db.get_spelling_suggestion("izg"), "zig");
76 TEST_EQUAL(db.get_spelling_suggestion("zgi"), "zig");
77 // Substitutions:
78 TEST_EQUAL(db.get_spelling_suggestion("sig"), "zig");
79 TEST_EQUAL(db.get_spelling_suggestion("zog"), "zig");
80 TEST_EQUAL(db.get_spelling_suggestion("zif"), "zig");
81 // Deletions:
82 TEST_EQUAL(db.get_spelling_suggestion("ig"), "zig");
83 TEST_EQUAL(db.get_spelling_suggestion("zg"), "zig");
84 TEST_EQUAL(db.get_spelling_suggestion("zi"), "zig");
85 // Insertions:
86 TEST_EQUAL(db.get_spelling_suggestion("azig"), "zig");
87 TEST_EQUAL(db.get_spelling_suggestion("zaig"), "zig");
88 TEST_EQUAL(db.get_spelling_suggestion("ziag"), "zig");
89 TEST_EQUAL(db.get_spelling_suggestion("ziga"), "zig");
91 // Check suggestions for single edit errors to "ch".
92 db.add_spelling("ch");
93 // Transpositions:
94 TEST_EQUAL(db.get_spelling_suggestion("hc"), "ch");
95 // Substitutions - we don't handle these for two character words:
96 TEST_EQUAL(db.get_spelling_suggestion("qh"), "");
97 TEST_EQUAL(db.get_spelling_suggestion("cq"), "");
98 // Deletions would leave a single character, and we don't handle those.
99 TEST_EQUAL(db.get_spelling_suggestion("c"), "");
100 TEST_EQUAL(db.get_spelling_suggestion("h"), "");
101 // Insertions:
102 TEST_EQUAL(db.get_spelling_suggestion("qch"), "ch");
103 TEST_EQUAL(db.get_spelling_suggestion("cqh"), "ch");
104 TEST_EQUAL(db.get_spelling_suggestion("chq"), "ch");
106 // Check assorted cases:
107 TEST_EQUAL(db.get_spelling_suggestion("shello"), "hello");
108 TEST_EQUAL(db.get_spelling_suggestion("hellot"), "hello");
109 TEST_EQUAL(db.get_spelling_suggestion("acell"), "cell");
110 TEST_EQUAL(db.get_spelling_suggestion("cella"), "cell");
111 TEST_EQUAL(db.get_spelling_suggestion("acella"), "cell");
112 TEST_EQUAL(db.get_spelling_suggestion("helo"), "hello");
113 TEST_EQUAL(db.get_spelling_suggestion("cll"), "cell");
114 TEST_EQUAL(db.get_spelling_suggestion("helol"), "hello");
115 TEST_EQUAL(db.get_spelling_suggestion("clel"), "cell");
116 TEST_EQUAL(db.get_spelling_suggestion("ecll"), "cell");
117 TEST_EQUAL(db.get_spelling_suggestion("cll"), "cell");
119 // Check that edit distance 3 isn't found by default:
120 TEST_EQUAL(db.get_spelling_suggestion("shelolx"), "");
121 TEST_EQUAL(db.get_spelling_suggestion("celling"), "");
122 TEST_EQUAL(db.get_spelling_suggestion("dellin"), "");
124 // Check that edit distance 3 is found if specified:
125 TEST_EQUAL(db.get_spelling_suggestion("shelolx", 3), "hello");
126 TEST_EQUAL(db.get_spelling_suggestion("celling", 3), "cell");
127 TEST_EQUAL(db.get_spelling_suggestion("dellin", 3), "cell");
129 // Make "hello" more frequent than "cell" (3 vs 2).
130 db.add_spelling("hello", 2);
131 TEST_EQUAL(db.get_spelling_suggestion("hell"), "hello");
132 db.commit();
133 TEST_EQUAL(db.get_spelling_suggestion("cello"), "hello");
134 db.remove_spelling("hello", 2);
135 TEST_EQUAL(db.get_spelling_suggestion("hell"), "cell");
136 // Test "over-removing".
137 db.remove_spelling("cell", 6);
138 TEST_EQUAL(db.get_spelling_suggestion("cell"), "hello");
139 db.commit();
140 TEST_EQUAL(db.get_spelling_suggestion("cell"), "hello");
141 db.remove_spelling("hello");
142 TEST_EQUAL(db.get_spelling_suggestion("cell"), "");
144 // Test removing words not in the table.
145 db.remove_spelling("nonsuch");
146 db.remove_spelling("zzzzzzzzz", 1000000);
147 db.remove_spelling("aarvark");
149 // Try removing word which was present but no longer is.
150 db.remove_spelling("hello");
151 db.commit();
152 db.remove_spelling("hello");
155 // Test spelling correction for Unicode.
156 DEFINE_TESTCASE(spell2, spelling) {
157 Xapian::WritableDatabase db = get_writable_database();
159 // Check that a UTF-8 sequence counts as a single character.
160 db.add_spelling("h\xc3\xb6hle");
161 db.add_spelling("ascii");
162 TEST_EQUAL(db.get_spelling_suggestion("hohle", 1), "h\xc3\xb6hle");
163 TEST_EQUAL(db.get_spelling_suggestion("hhle", 1), "h\xc3\xb6hle");
164 TEST_EQUAL(db.get_spelling_suggestion("\xf0\xa8\xa8\x8f\xc3\xb6le", 2), "h\xc3\xb6hle");
165 TEST_EQUAL(db.get_spelling_suggestion("hh\xc3\xb6l"), "h\xc3\xb6hle");
166 TEST_EQUAL(db.get_spelling_suggestion("as\xc3\xb6\xc3\xb7i"), "ascii");
167 TEST_EQUAL(db.get_spelling_suggestion("asc\xc3\xb6i\xc3\xb7i"), "ascii");
168 db.commit();
169 Xapian::Database dbr(get_writable_database_as_database());
170 TEST_EQUAL(dbr.get_spelling_suggestion("hohle", 1), "h\xc3\xb6hle");
171 TEST_EQUAL(dbr.get_spelling_suggestion("hhle", 1), "h\xc3\xb6hle");
172 TEST_EQUAL(dbr.get_spelling_suggestion("\xf0\xa8\xa8\x8f\xc3\xb6le", 2), "h\xc3\xb6hle");
173 TEST_EQUAL(dbr.get_spelling_suggestion("hh\xc3\xb6l"), "h\xc3\xb6hle");
174 TEST_EQUAL(dbr.get_spelling_suggestion("as\xc3\xb6\xc3\xb7i"), "ascii");
175 TEST_EQUAL(dbr.get_spelling_suggestion("asc\xc3\xb6i\xc3\xb7i"), "ascii");
178 // Test spelling correction with multi databases
179 DEFINE_TESTCASE(spell3, spelling) {
180 Xapian::WritableDatabase db1 = get_writable_database();
181 // We can't just call get_writable_database() since it would delete db1
182 // which doesn't work at all under __WIN32__ and will go wrong elsewhere if
183 // changes to db1 are committed.
184 Xapian::WritableDatabase db2 = get_named_writable_database("spell3", "");
186 db1.add_spelling("hello");
187 db1.add_spelling("cell", 2);
188 db2.add_spelling("hello", 2);
189 db2.add_spelling("helo");
191 Xapian::Database db;
192 db.add_database(db1);
193 db.add_database(db2);
195 TEST_EQUAL(db.get_spelling_suggestion("hello"), "");
196 TEST_EQUAL(db.get_spelling_suggestion("hell"), "hello");
197 TEST_EQUAL(db1.get_spelling_suggestion("hell"), "cell");
198 TEST_EQUAL(db2.get_spelling_suggestion("hell"), "hello");
200 // Test spelling iterator
201 Xapian::TermIterator i(db1.spellings_begin());
202 TEST_EQUAL(*i, "cell");
203 TEST_EQUAL(i.get_termfreq(), 2);
204 ++i;
205 TEST_EQUAL(*i, "hello");
206 TEST_EQUAL(i.get_termfreq(), 1);
207 ++i;
208 TEST(i == db1.spellings_end());
210 i = db2.spellings_begin();
211 TEST_EQUAL(*i, "hello");
212 TEST_EQUAL(i.get_termfreq(), 2);
213 ++i;
214 TEST_EQUAL(*i, "helo");
215 TEST_EQUAL(i.get_termfreq(), 1);
216 ++i;
217 TEST(i == db2.spellings_end());
219 i = db.spellings_begin();
220 TEST_EQUAL(*i, "cell");
221 TEST_EQUAL(i.get_termfreq(), 2);
222 ++i;
223 TEST_EQUAL(*i, "hello");
224 TEST_EQUAL(i.get_termfreq(), 3);
225 ++i;
226 TEST_EQUAL(*i, "helo");
227 TEST_EQUAL(i.get_termfreq(), 1);
228 ++i;
229 TEST(i == db.spellings_end());
231 // Regression test for TermIterator::skip_to() bug fixed in 1.4.19.
232 i = db.spellings_begin();
233 i.skip_to("helo");
234 TEST(i != db.spellings_end());
235 TEST_EQUAL(*i, "helo");
236 TEST_EQUAL(i.get_termfreq(), 1);
237 i.skip_to("help");
238 TEST(i == db.spellings_end());
241 // Regression test - check that appending works correctly.
242 DEFINE_TESTCASE(spell4, spelling) {
243 Xapian::WritableDatabase db = get_writable_database();
245 db.add_spelling("check");
246 db.add_spelling("pecks", 2);
247 db.commit();
248 db.add_spelling("becky");
249 db.commit();
251 TEST_EQUAL(db.get_spelling_suggestion("jeck", 2), "pecks");
254 // Regression test - used to segfault with some input values.
255 DEFINE_TESTCASE(spell5, spelling) {
256 const char * target = "\xe4\xb8\x80\xe4\xba\x9b";
258 Xapian::WritableDatabase db = get_writable_database();
259 db.add_spelling(target);
260 db.commit();
262 string s = db.get_spelling_suggestion("\xe4\xb8\x8d", 3);
263 TEST_EQUAL(s, target);
266 // Test basic spelling correction features.
267 DEFINE_TESTCASE(spell6, spelling) {
268 Xapian::WritableDatabase db = get_writable_database();
270 // Check that the more frequent term is chosen.
271 db.add_spelling("hello", 2);
272 db.add_spelling("sell", 3);
273 TEST_EQUAL(db.get_spelling_suggestion("hell"), "sell");
274 db.commit();
275 Xapian::Database dbr(get_writable_database_as_database());
276 TEST_EQUAL(db.get_spelling_suggestion("hell"), "sell");
277 TEST_EQUAL(dbr.get_spelling_suggestion("hell"), "sell");
280 // Test suggestions when there's an exact match.
281 DEFINE_TESTCASE(spell7, spelling) {
282 Xapian::WritableDatabase db = get_writable_database();
284 // Check that the more frequent term is chosen.
285 db.add_spelling("word", 57);
286 db.add_spelling("wrod", 3);
287 db.add_spelling("sword", 56);
288 db.add_spelling("words", 57);
289 db.add_spelling("ward", 58);
290 db.commit();
291 TEST_EQUAL(db.get_spelling_suggestion("ward"), "");
292 TEST_EQUAL(db.get_spelling_suggestion("words"), "word");
293 TEST_EQUAL(db.get_spelling_suggestion("sword"), "word");
294 TEST_EQUAL(db.get_spelling_suggestion("wrod"), "word");
297 /// Regression test - repeated trigrams cancelled in 1.2.5 and earlier.
298 DEFINE_TESTCASE(spell8, spelling) {
299 Xapian::WritableDatabase db = get_writable_database();
301 // kin and kin used to cancel out in "skinking".
302 db.add_spelling("skinking", 2);
303 db.add_spelling("stinking", 1);
304 db.commit();
305 TEST_EQUAL(db.get_spelling_suggestion("scimkin", 3), "skinking");