2 # some of the comments match the comments in MogileFS/Worker/Fsck.pm
3 # _exactly_ for reference purposes
8 use Time::HiRes qw(sleep);
12 find_mogclient_or_skip();
15 my $sto = eval { temp_store(); };
17 plan skip_all => "Can't create temporary test database: $@";
23 $mogroot{1} = File::Temp::tempdir( CLEANUP => 1 );
24 $mogroot{2} = File::Temp::tempdir( CLEANUP => 1 );
25 $mogroot{3} = File::Temp::tempdir( CLEANUP => 1 );
26 my $dev2host = { 1 => 1, 2 => 2, 3 => 2 };
27 foreach (sort { $a <=> $b } keys %$dev2host) {
28 my $root = $mogroot{$dev2host->{$_}};
29 mkdir("$root/dev$_") or die "Failed to create dev$_ dir: $!";
32 my $ms1 = create_mogstored("127.0.1.1", $mogroot{1});
33 ok($ms1, "got mogstored1");
34 my $ms2 = create_mogstored("127.0.1.2", $mogroot{2});
35 ok($ms1, "got mogstored2");
37 while (! -e "$mogroot{1}/dev1/usage" ||
38 ! -e "$mogroot{2}/dev2/usage" ||
39 ! -e "$mogroot{2}/dev3/usage") {
40 print "Waiting on usage...\n";
44 my $tmptrack = create_temp_tracker($sto);
47 my $admin = IO::Socket::INET->new(PeerAddr => '127.0.0.1:7001');
48 $admin or die "failed to create admin socket: $!";
49 my $moga = MogileFS::Admin->new(hosts => [ "127.0.0.1:7001" ]);
50 my $mogc = MogileFS::Client->new(
52 hosts => [ "127.0.0.1:7001" ],
54 my $be = $mogc->{backend}; # gross, reaching inside of MogileFS::Client
56 # test some basic commands to backend
57 ok($tmptrack->mogadm("domain", "add", "testdom"), "created test domain");
58 ok($tmptrack->mogadm("class", "add", "testdom", "2copies", "--mindevcount=2"), "created 2copies class in testdom");
59 ok($tmptrack->mogadm("class", "add", "testdom", "1copy", "--mindevcount=1"), "created 1copy class in testdom");
61 ok($tmptrack->mogadm("host", "add", "hostA", "--ip=127.0.1.1", "--status=alive"), "created hostA");
62 ok($tmptrack->mogadm("host", "add", "hostB", "--ip=127.0.1.2", "--status=alive"), "created hostB");
64 ok($tmptrack->mogadm("device", "add", "hostA", 1), "created dev1 on hostA");
65 ok($tmptrack->mogadm("device", "add", "hostB", 2), "created dev2 on hostB");
67 sub wait_for_monitor {
69 my $was = $be->{timeout}; # can't use local on phash :(
71 ok($be->do_request("clear_cache", {}), "waited for monitor")
72 or die "Failed to wait for monitor";
73 ok($be->do_request("clear_cache", {}), "waited for monitor")
74 or die "Failed to wait for monitor";
75 $be->{timeout} = $was;
78 sub wait_for_empty_queue {
79 my ($table, $dbh) = @_;
85 $count = $dbh->selectrow_array("SELECT COUNT(*) from $table");
86 return if ($count == 0);
89 my $time = $delay * $limit;
90 die "$table is not empty after ${time}s!";
94 my ($tmptrack, $dbh) = @_;
96 # this should help prevent race conditions:
97 wait_for_empty_queue("file_to_queue", $dbh);
99 ok($tmptrack->mogadm("fsck", "stop"), "stop fsck");
100 ok($tmptrack->mogadm("fsck", "clearlog"), "clear fsck log");
101 ok($tmptrack->mogadm("fsck", "reset"), "reset fsck");
102 ok($tmptrack->mogadm("fsck", "start"), "started fsck");
105 wait_for_monitor($be);
107 my ($req, $rv, %opts, @paths, @fsck_log, $info);
108 my $ua = LWP::UserAgent->new;
114 # upload a file and wait for replica to appear
116 my $fh = $mogc->new_file($key, "1copy");
118 ok(close($fh), "closed file");
121 # first obvious fucked-up case: no devids even presumed to exist.
123 $info = $mogc->file_info($key);
124 is($info->{devcount}, 1, "ensure devcount is correct at start");
126 # ensure repl queue is empty before destroying file_on
127 wait_for_empty_queue("file_to_replicate", $dbh);
129 my @devids = $sto->fid_devids($info->{fid});
130 is(scalar(@devids), 1, "devids retrieved");
131 is($sto->remove_fidid_from_devid($info->{fid}, $devids[0]), 1,
132 "delete $key from file_on table");
134 full_fsck($tmptrack, $dbh);
136 @fsck_log = $sto->fsck_log_rows;
137 } while (scalar(@fsck_log) < 3 && sleep(0.1));
139 wait_for_empty_queue("file_to_queue", $dbh);
140 @fsck_log = $sto->fsck_log_rows;
142 my $nopa = $fsck_log[0];
143 is($nopa->{evcode}, "NOPA", "evcode for no paths logged");
145 # entering "desperate" mode
146 my $srch = $fsck_log[1];
147 is($srch->{evcode}, "SRCH", "evcode for start search logged");
149 # wow, we actually found it!
150 my $fond = $fsck_log[2];
151 is($fond->{evcode}, "FOND", "evcode for start search logged");
153 $info = $mogc->file_info($key);
154 is($info->{devcount}, 1, "ensure devcount is correct at fsck end");
155 @paths = $mogc->get_paths($key);
156 is(scalar(@paths), 1, "get_paths returns correctly at fsck end");
159 # update class to require 2copies and have fsck fix it
161 @paths = $mogc->get_paths($key);
162 is(scalar(@paths), 1, "only one path exists before fsck");
164 # _NOT_ using "updateclass" command since that enqueues for replication
165 my $fid = MogileFS::FID->new($info->{fid});
166 my $classid_2copies = $sto->get_classid_by_name($fid->dmid, "2copies");
167 is($fid->update_class(classid => $classid_2copies), 1, "classid updated");
169 full_fsck($tmptrack, $dbh);
172 @paths = $mogc->get_paths($key);
173 } while (scalar(@paths) == 1 and sleep(0.1));
174 is(scalar(@paths), 2, "replicated from fsck");
176 $info = $mogc->file_info($key);
177 is($info->{devcount}, 2, "ensure devcount is updated by replicate");
180 @fsck_log = $sto->fsck_log_rows;
181 } while (scalar(@fsck_log) == 0 and sleep(0.1));
183 my $povi = $fsck_log[0];
184 is($povi->{evcode}, "POVI", "policy violation logged by fsck");
186 my $repl = $fsck_log[1];
187 is($repl->{evcode}, "REPL", "replication request logged by fsck");
190 # wrong devcount in file column, but otherwise everything is OK
192 foreach my $wrong_devcount (13, 0, 1) {
193 is($dbh->do("UPDATE file SET devcount = ? WHERE fid = ?", undef, $wrong_devcount, $info->{fid}), 1, "set improper devcount");
195 $info = $mogc->file_info($key);
196 is($info->{devcount}, $wrong_devcount, "devcount is set to $wrong_devcount");
198 full_fsck($tmptrack, $dbh);
201 $info = $mogc->file_info($key);
202 } while ($info->{devcount} == $wrong_devcount && sleep(0.1));
203 is($info->{devcount}, 2, "devcount is corrected by fsck");
205 # XXX POVI gets logged here, but BCNT might be more correct...
206 wait_for_empty_queue("file_to_queue", $dbh);
207 @fsck_log = $sto->fsck_log_rows;
208 is($fsck_log[0]->{evcode}, "POVI", "policy violation logged");
212 # nuke a file from disk but keep the file_on row
214 @paths = $mogc->get_paths($key);
215 is(scalar(@paths), 2, "two paths returned from get_paths");
216 $req = HTTP::Request->new(DELETE => $paths[0]);
217 $rv = $ua->request($req);
218 ok($rv->is_success, "DELETE successful");
220 full_fsck($tmptrack, $dbh);
222 @fsck_log = $sto->fsck_log_rows;
223 } while (scalar(@fsck_log) < 2 && sleep(0.1));
225 my $miss = $fsck_log[0];
226 is($miss->{evcode}, "MISS", "missing file logged by fsck");
228 my $repl = $fsck_log[1];
229 is($repl->{evcode}, "REPL", "replication request logged by fsck");
231 wait_for_empty_queue("file_to_replicate", $dbh);
233 @paths = $mogc->get_paths($key);
234 is(scalar(@paths), 2, "two paths returned from get_paths");
235 foreach my $path (@paths) {
236 $rv = $ua->get($path);
237 is($rv->content, "hello\n", "GET successful on restored path");
241 # change the length of a file from disk and have fsck correct it
243 @paths = $mogc->get_paths($key);
244 is(scalar(@paths), 2, "two paths returned from get_paths");
245 $req = HTTP::Request->new(PUT => $paths[0]);
246 $req->content("hello\r\n");
247 $rv = $ua->request($req);
248 ok($rv->is_success, "PUT successful");
250 full_fsck($tmptrack, $dbh);
252 @fsck_log = $sto->fsck_log_rows;
253 } while (scalar(@fsck_log) < 2 && sleep(0.1));
255 my $blen = $fsck_log[0];
256 is($blen->{evcode}, "BLEN", "missing file logged by fsck");
258 my $repl = $fsck_log[1];
259 is($repl->{evcode}, "REPL", "replication request logged by fsck");
261 wait_for_empty_queue("file_to_replicate", $dbh);
263 @paths = $mogc->get_paths($key);
264 is(scalar(@paths), 2, "two paths returned from get_paths");
265 foreach my $path (@paths) {
266 $rv = $ua->get($path);
267 is($rv->content, "hello\n", "GET successful on restored path");
271 # nuke a file completely and irreparably
273 @paths = $mogc->get_paths($key);
274 is(scalar(@paths), 2, "two paths returned from get_paths");
275 foreach my $path (@paths) {
276 $req = HTTP::Request->new(DELETE => $path);
277 $rv = $ua->request($req);
278 ok($rv->is_success, "DELETE successful");
281 full_fsck($tmptrack, $dbh);
283 @fsck_log = $sto->fsck_log_rows;
284 } while (scalar(@fsck_log) < 4 && sleep(0.1));
286 is($fsck_log[0]->{evcode}, "MISS", "missing file logged for first path");
287 is($fsck_log[1]->{evcode}, "MISS", "missing file logged for second path");
288 is($fsck_log[2]->{evcode}, "SRCH", "desperate search attempt logged");
289 is($fsck_log[3]->{evcode}, "GONE", "inability to fix FID logged");
291 wait_for_empty_queue("file_to_queue", $dbh);
292 $info = $mogc->file_info($key);
294 # XXX devcount probably needs to be updated on GONE
295 # is($info->{devcount}, 2, "devcount updated to zero");
296 @paths = $mogc->get_paths($key);
297 is(scalar(@paths), 0, "get_paths returns nothing");
300 # upload a file and wait for replica to appear
302 my $fh = $mogc->new_file($key, "2copies");
304 ok(close($fh), "closed file");
307 $info = $mogc->file_info($key);
308 } while ($info->{devcount} < 2);
309 is($info->{devcount}, 2, "ensure devcount is correct at start");
312 # stall fsck with a non-responsive host
314 is(kill("STOP", $ms1->pid), 1, "send SIGSTOP to mogstored1");
315 wait_for_monitor($be);
318 ok($tmptrack->mogadm("fsck", "start"), "started fsck, sleeping ${delay}s");
320 is($sto->file_queue_length(MogileFS::Config::FSCK_QUEUE), 1, "fsck queue still blocked");
323 # resume fsck when host is responsive again
325 is(kill("CONT", $ms1->pid), 1, "send SIGCONT to mogstored1");
326 wait_for_monitor($be);
328 # force fsck to wakeup and do work again
329 my $now = $sto->unix_timestamp;
330 is($sto->dbh->do("UPDATE file_to_queue SET nexttry = $now"), 1, "unblocked fsck queue");
331 ok($admin->syswrite("!to fsck :wake_up\n"), "force fsck to wake up");
332 ok($admin->getline, "got wakeup response 1");
333 ok($admin->getline, "got wakeup response 2");
335 foreach my $i (1..30) {
336 last if $sto->file_queue_length(MogileFS::Config::FSCK_QUEUE) == 0;
341 is($sto->file_queue_length(MogileFS::Config::FSCK_QUEUE), 0, "fsck queue emptied");
344 # upload a file and wait for replica to appear
346 my $fh = $mogc->new_file($key, "2copies");
348 ok(close($fh), "closed file");
351 $info = $mogc->file_info($key);
352 } while ($info->{devcount} < 2);
353 is($info->{devcount}, 2, "ensure devcount is correct at start");
356 # stall fsck with a non-responsive host
357 # resume fsck when host is responsive again
359 is(kill("STOP", $ms1->pid), 1, "send SIGSTOP to mogstored1 to stall");
360 wait_for_monitor($be);
363 ok($tmptrack->mogadm("fsck", "start"), "started fsck, sleeping ${delay}s");
365 is($sto->file_queue_length(MogileFS::Config::FSCK_QUEUE), 1, "fsck queue still blocked");
367 is(kill("CONT", $ms1->pid), 1, "send SIGCONT to mogstored1 to resume");
368 wait_for_monitor($be);
370 # force fsck to wakeup and do work again
371 my $now = $sto->unix_timestamp;
372 is($sto->dbh->do("UPDATE file_to_queue SET nexttry = $now"), 1, "unblocked fsck queue");
373 ok($admin->syswrite("!to fsck :wake_up\n"), "force fsck to wake up");
374 ok($admin->getline, "got wakeup response 1");
375 ok($admin->getline, "got wakeup response 2");
377 foreach my $i (1..30) {
378 last if $sto->file_queue_length(MogileFS::Config::FSCK_QUEUE) == 0;
383 is($sto->file_queue_length(MogileFS::Config::FSCK_QUEUE), 0, "fsck queue emptied");
386 # cleanup over-replicated file
388 $info = $mogc->file_info($key);
389 is($info->{devcount}, 2, "ensure devcount is correct at start");
391 # _NOT_ using "updateclass" command since that enqueues for replication
392 my $fid = MogileFS::FID->new($info->{fid});
393 my $classid_1copy = $sto->get_classid_by_name($fid->dmid, "1copy");
394 is($fid->update_class(classid => $classid_1copy), 1, "classid updated");
396 full_fsck($tmptrack, $dbh);
398 sleep(1) while $mogc->file_info($key)->{devcount} == 2;
399 is($mogc->file_info($key)->{devcount}, 1, "too-happy file cleaned up");
401 @fsck_log = $sto->fsck_log_rows;
402 is($fsck_log[0]->{evcode}, "POVI", "policy violation logged");
404 # replicator is requested to delete too-happy file
405 is($fsck_log[1]->{evcode}, "REPL", "replication request logged");
408 # kill a device and replace it with another, without help from reaper
409 # this test may become impossible if monitor + reaper are merged...
411 ok($mogc->update_class($key, "2copies"), "request 2 replicas again");
413 $info = $mogc->file_info($key);
414 } while ($info->{devcount} < 2);
415 is($info->{devcount}, 2, "ensure devcount is correct at start");
416 wait_for_empty_queue("file_to_replicate", $dbh);
418 $admin->syswrite("!jobs\n");
421 while (my $line = $admin->getline) {
422 last if $line =~ /^\.\r?\n/;
424 if ($line =~ /^reaper pids (\d+)\r?\n/) {
428 ok($reaper_pid, "got pid of reaper");
430 # reaper is watchdog is 240s, and it wakes up in 5s intervals right now
431 # so we should be safe from watchdog for now...
432 ok(kill("STOP", $reaper_pid), "stopped reaper from running");
434 ok($tmptrack->mogadm("device", "mark", "hostB", 2, "dead"), "mark dev2 as dead");
435 ok($tmptrack->mogadm("device", "add", "hostB", 3), "created dev3 on hostB");
436 wait_for_monitor($be);
438 full_fsck($tmptrack, $dbh);
440 sleep 1 while MogileFS::Config->server_setting("fsck_host");
442 foreach my $i (1..30) {
443 last if $sto->file_queue_length(MogileFS::Config::FSCK_QUEUE) == 0;
446 is($sto->file_queue_length(MogileFS::Config::FSCK_QUEUE), 0, "fsck queue empty");
448 # fsck should've corrected what reaper missed
449 @fsck_log = $sto->fsck_log_rows;
450 is(scalar(@fsck_log), 1, "fsck log populated");
451 is($fsck_log[0]->{evcode}, "REPL", "replication enqueued");
453 ok(kill("CONT", $reaper_pid), "resumed reaper");