2 * @brief Test the spelling correction suggestion API.
4 /* Copyright (C) 2007-2021 Olly Betts
5 * Copyright (C) 2007 Lemur Consulting Ltd
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
24 #include "api_spelling.h"
29 #include "testsuite.h"
30 #include "testutils.h"
36 // Test add_spelling() and remove_spelling(), which remote dbs support.
37 DEFINE_TESTCASE(spell0
, spelling
|| remote
) {
38 Xapian::WritableDatabase db
= get_writable_database();
40 db
.add_spelling("hello");
41 db
.add_spelling("cell", 2);
43 db
.add_spelling("zig");
44 db
.add_spelling("ch");
45 db
.add_spelling("hello", 2);
46 db
.remove_spelling("hello", 2);
47 db
.remove_spelling("cell", 6);
49 db
.remove_spelling("hello");
50 db
.remove_spelling("nonsuch");
51 db
.remove_spelling("zzzzzzzzz", 1000000);
52 db
.remove_spelling("aarvark");
53 db
.remove_spelling("hello");
55 db
.remove_spelling("hello");
58 // Test basic spelling correction features.
59 DEFINE_TESTCASE(spell1
, spelling
) {
60 Xapian::WritableDatabase db
= get_writable_database();
62 // Check that the more frequent term is chosen.
63 db
.add_spelling("hello");
64 TEST_EQUAL(db
.get_spelling_suggestion("cell"), "hello");
65 db
.add_spelling("cell", 2);
66 TEST_EQUAL(db
.get_spelling_suggestion("hell"), "cell");
68 Xapian::Database
dbr(get_writable_database_as_database());
69 TEST_EQUAL(db
.get_spelling_suggestion("hell"), "cell");
70 TEST_EQUAL(dbr
.get_spelling_suggestion("hell"), "cell");
72 // Check suggestions for single edit errors to "zig".
73 db
.add_spelling("zig");
75 TEST_EQUAL(db
.get_spelling_suggestion("izg"), "zig");
76 TEST_EQUAL(db
.get_spelling_suggestion("zgi"), "zig");
78 TEST_EQUAL(db
.get_spelling_suggestion("sig"), "zig");
79 TEST_EQUAL(db
.get_spelling_suggestion("zog"), "zig");
80 TEST_EQUAL(db
.get_spelling_suggestion("zif"), "zig");
82 TEST_EQUAL(db
.get_spelling_suggestion("ig"), "zig");
83 TEST_EQUAL(db
.get_spelling_suggestion("zg"), "zig");
84 TEST_EQUAL(db
.get_spelling_suggestion("zi"), "zig");
86 TEST_EQUAL(db
.get_spelling_suggestion("azig"), "zig");
87 TEST_EQUAL(db
.get_spelling_suggestion("zaig"), "zig");
88 TEST_EQUAL(db
.get_spelling_suggestion("ziag"), "zig");
89 TEST_EQUAL(db
.get_spelling_suggestion("ziga"), "zig");
91 // Check suggestions for single edit errors to "ch".
92 db
.add_spelling("ch");
94 TEST_EQUAL(db
.get_spelling_suggestion("hc"), "ch");
95 // Substitutions - we don't handle these for two character words:
96 TEST_EQUAL(db
.get_spelling_suggestion("qh"), "");
97 TEST_EQUAL(db
.get_spelling_suggestion("cq"), "");
98 // Deletions would leave a single character, and we don't handle those.
99 TEST_EQUAL(db
.get_spelling_suggestion("c"), "");
100 TEST_EQUAL(db
.get_spelling_suggestion("h"), "");
102 TEST_EQUAL(db
.get_spelling_suggestion("qch"), "ch");
103 TEST_EQUAL(db
.get_spelling_suggestion("cqh"), "ch");
104 TEST_EQUAL(db
.get_spelling_suggestion("chq"), "ch");
106 // Check assorted cases:
107 TEST_EQUAL(db
.get_spelling_suggestion("shello"), "hello");
108 TEST_EQUAL(db
.get_spelling_suggestion("hellot"), "hello");
109 TEST_EQUAL(db
.get_spelling_suggestion("acell"), "cell");
110 TEST_EQUAL(db
.get_spelling_suggestion("cella"), "cell");
111 TEST_EQUAL(db
.get_spelling_suggestion("acella"), "cell");
112 TEST_EQUAL(db
.get_spelling_suggestion("helo"), "hello");
113 TEST_EQUAL(db
.get_spelling_suggestion("cll"), "cell");
114 TEST_EQUAL(db
.get_spelling_suggestion("helol"), "hello");
115 TEST_EQUAL(db
.get_spelling_suggestion("clel"), "cell");
116 TEST_EQUAL(db
.get_spelling_suggestion("ecll"), "cell");
117 TEST_EQUAL(db
.get_spelling_suggestion("cll"), "cell");
119 // Check that edit distance 3 isn't found by default:
120 TEST_EQUAL(db
.get_spelling_suggestion("shelolx"), "");
121 TEST_EQUAL(db
.get_spelling_suggestion("celling"), "");
122 TEST_EQUAL(db
.get_spelling_suggestion("dellin"), "");
124 // Check that edit distance 3 is found if specified:
125 TEST_EQUAL(db
.get_spelling_suggestion("shelolx", 3), "hello");
126 TEST_EQUAL(db
.get_spelling_suggestion("celling", 3), "cell");
127 TEST_EQUAL(db
.get_spelling_suggestion("dellin", 3), "cell");
129 // Make "hello" more frequent than "cell" (3 vs 2).
130 db
.add_spelling("hello", 2);
131 TEST_EQUAL(db
.get_spelling_suggestion("hell"), "hello");
133 TEST_EQUAL(db
.get_spelling_suggestion("cello"), "hello");
134 db
.remove_spelling("hello", 2);
135 TEST_EQUAL(db
.get_spelling_suggestion("hell"), "cell");
136 // Test "over-removing".
137 db
.remove_spelling("cell", 6);
138 TEST_EQUAL(db
.get_spelling_suggestion("cell"), "hello");
140 TEST_EQUAL(db
.get_spelling_suggestion("cell"), "hello");
141 db
.remove_spelling("hello");
142 TEST_EQUAL(db
.get_spelling_suggestion("cell"), "");
144 // Test removing words not in the table.
145 db
.remove_spelling("nonsuch");
146 db
.remove_spelling("zzzzzzzzz", 1000000);
147 db
.remove_spelling("aarvark");
149 // Try removing word which was present but no longer is.
150 db
.remove_spelling("hello");
152 db
.remove_spelling("hello");
155 // Test spelling correction for Unicode.
156 DEFINE_TESTCASE(spell2
, spelling
) {
157 Xapian::WritableDatabase db
= get_writable_database();
159 // Check that a UTF-8 sequence counts as a single character.
160 db
.add_spelling("h\xc3\xb6hle");
161 db
.add_spelling("ascii");
162 TEST_EQUAL(db
.get_spelling_suggestion("hohle", 1), "h\xc3\xb6hle");
163 TEST_EQUAL(db
.get_spelling_suggestion("hhle", 1), "h\xc3\xb6hle");
164 TEST_EQUAL(db
.get_spelling_suggestion("\xf0\xa8\xa8\x8f\xc3\xb6le", 2), "h\xc3\xb6hle");
165 TEST_EQUAL(db
.get_spelling_suggestion("hh\xc3\xb6l"), "h\xc3\xb6hle");
166 TEST_EQUAL(db
.get_spelling_suggestion("as\xc3\xb6\xc3\xb7i"), "ascii");
167 TEST_EQUAL(db
.get_spelling_suggestion("asc\xc3\xb6i\xc3\xb7i"), "ascii");
169 Xapian::Database
dbr(get_writable_database_as_database());
170 TEST_EQUAL(dbr
.get_spelling_suggestion("hohle", 1), "h\xc3\xb6hle");
171 TEST_EQUAL(dbr
.get_spelling_suggestion("hhle", 1), "h\xc3\xb6hle");
172 TEST_EQUAL(dbr
.get_spelling_suggestion("\xf0\xa8\xa8\x8f\xc3\xb6le", 2), "h\xc3\xb6hle");
173 TEST_EQUAL(dbr
.get_spelling_suggestion("hh\xc3\xb6l"), "h\xc3\xb6hle");
174 TEST_EQUAL(dbr
.get_spelling_suggestion("as\xc3\xb6\xc3\xb7i"), "ascii");
175 TEST_EQUAL(dbr
.get_spelling_suggestion("asc\xc3\xb6i\xc3\xb7i"), "ascii");
178 // Test spelling correction with multi databases
179 DEFINE_TESTCASE(spell3
, spelling
) {
180 Xapian::WritableDatabase db1
= get_writable_database();
181 // We can't just call get_writable_database() since it would delete db1
182 // which doesn't work at all under __WIN32__ and will go wrong elsewhere if
183 // changes to db1 are committed.
184 Xapian::WritableDatabase db2
= get_named_writable_database("spell3", "");
186 db1
.add_spelling("hello");
187 db1
.add_spelling("cell", 2);
188 db2
.add_spelling("hello", 2);
189 db2
.add_spelling("helo");
192 db
.add_database(db1
);
193 db
.add_database(db2
);
195 TEST_EQUAL(db
.get_spelling_suggestion("hello"), "");
196 TEST_EQUAL(db
.get_spelling_suggestion("hell"), "hello");
197 TEST_EQUAL(db1
.get_spelling_suggestion("hell"), "cell");
198 TEST_EQUAL(db2
.get_spelling_suggestion("hell"), "hello");
200 // Test spelling iterator
201 Xapian::TermIterator
i(db1
.spellings_begin());
202 TEST_EQUAL(*i
, "cell");
203 TEST_EQUAL(i
.get_termfreq(), 2);
205 TEST_EQUAL(*i
, "hello");
206 TEST_EQUAL(i
.get_termfreq(), 1);
208 TEST(i
== db1
.spellings_end());
210 i
= db2
.spellings_begin();
211 TEST_EQUAL(*i
, "hello");
212 TEST_EQUAL(i
.get_termfreq(), 2);
214 TEST_EQUAL(*i
, "helo");
215 TEST_EQUAL(i
.get_termfreq(), 1);
217 TEST(i
== db2
.spellings_end());
219 i
= db
.spellings_begin();
220 TEST_EQUAL(*i
, "cell");
221 TEST_EQUAL(i
.get_termfreq(), 2);
223 TEST_EQUAL(*i
, "hello");
224 TEST_EQUAL(i
.get_termfreq(), 3);
226 TEST_EQUAL(*i
, "helo");
227 TEST_EQUAL(i
.get_termfreq(), 1);
229 TEST(i
== db
.spellings_end());
231 // Regression test for TermIterator::skip_to() bug fixed in 1.4.19.
232 i
= db
.spellings_begin();
234 TEST(i
!= db
.spellings_end());
235 TEST_EQUAL(*i
, "helo");
236 TEST_EQUAL(i
.get_termfreq(), 1);
238 TEST(i
== db
.spellings_end());
241 // Regression test - check that appending works correctly.
242 DEFINE_TESTCASE(spell4
, spelling
) {
243 Xapian::WritableDatabase db
= get_writable_database();
245 db
.add_spelling("check");
246 db
.add_spelling("pecks", 2);
248 db
.add_spelling("becky");
251 TEST_EQUAL(db
.get_spelling_suggestion("jeck", 2), "pecks");
254 // Regression test - used to segfault with some input values.
255 DEFINE_TESTCASE(spell5
, spelling
) {
256 const char * target
= "\xe4\xb8\x80\xe4\xba\x9b";
258 Xapian::WritableDatabase db
= get_writable_database();
259 db
.add_spelling(target
);
262 string s
= db
.get_spelling_suggestion("\xe4\xb8\x8d", 3);
263 TEST_EQUAL(s
, target
);
266 // Test basic spelling correction features.
267 DEFINE_TESTCASE(spell6
, spelling
) {
268 Xapian::WritableDatabase db
= get_writable_database();
270 // Check that the more frequent term is chosen.
271 db
.add_spelling("hello", 2);
272 db
.add_spelling("sell", 3);
273 TEST_EQUAL(db
.get_spelling_suggestion("hell"), "sell");
275 Xapian::Database
dbr(get_writable_database_as_database());
276 TEST_EQUAL(db
.get_spelling_suggestion("hell"), "sell");
277 TEST_EQUAL(dbr
.get_spelling_suggestion("hell"), "sell");
280 // Test suggestions when there's an exact match.
281 DEFINE_TESTCASE(spell7
, spelling
) {
282 Xapian::WritableDatabase db
= get_writable_database();
284 // Check that the more frequent term is chosen.
285 db
.add_spelling("word", 57);
286 db
.add_spelling("wrod", 3);
287 db
.add_spelling("sword", 56);
288 db
.add_spelling("words", 57);
289 db
.add_spelling("ward", 58);
291 TEST_EQUAL(db
.get_spelling_suggestion("ward"), "");
292 TEST_EQUAL(db
.get_spelling_suggestion("words"), "word");
293 TEST_EQUAL(db
.get_spelling_suggestion("sword"), "word");
294 TEST_EQUAL(db
.get_spelling_suggestion("wrod"), "word");
297 /// Regression test - repeated trigrams cancelled in 1.2.5 and earlier.
298 DEFINE_TESTCASE(spell8
, spelling
) {
299 Xapian::WritableDatabase db
= get_writable_database();
301 // kin and kin used to cancel out in "skinking".
302 db
.add_spelling("skinking", 2);
303 db
.add_spelling("stinking", 1);
305 TEST_EQUAL(db
.get_spelling_suggestion("scimkin", 3), "skinking");