1 package MogileFS
::Worker
::Delete
;
5 use base
'MogileFS::Worker';
6 use MogileFS
::Util
qw(error);
8 # we select 1000 but only do a random 100 of them, to allow
9 # for stateless parallelism
# LIMIT: the "1000" select size described above (its use is not visible in
# this view of the file).
10 use constant LIMIT
=> 1000;
# PER_BATCH: the "100" described above; process_deletes stops its shuffled
# pass once its counter exceeds this value.
11 use constant PER_BATCH
=> 100;
13 # TODO: use LWP and persistent connections to do deletes. less local ports used.
# Constructor body (the enclosing `sub` line is not visible in this view):
# unpacks the class name and $psock, allocates the object with fields::new,
# then hands $psock to the parent class's new().
# NOTE(review): $psock is presumably the socket to the parent process -- confirm.
16 my ($class, $psock) = @_;
17 my $self = fields
::new
($class);
18 $self->SUPER::new
($psock);
# Watchdog timeout for the delete worker: always the constant 120
# (presumably seconds -- confirm against the parent worker class).
sub watchdog_timeout {
    return 120;
}
# Main loop body of the delete worker. The enclosing sub header, the loop
# construct and several closing braces are not visible in this view, so the
# notes below describe only the statements shown.
#
# Each pass announces boredom to the parent, reads from the parent, then runs
# process_tempfiles, (when due) the old-queue pair reenqueue_delayed_deletes /
# process_deletes, and process_deletes2. If none of them reported work, the
# sleep counter is bumped.
28 my $sleep_for = 0; # we sleep longer and longer until we hit $sleep_max
29 my $sleep_max = 5; # max sleep when there's nothing to do.
# The old queue is only polled once time() has passed $old_queue_check.
31 my $old_queue_check = 0; # next time to check the old queue.
32 my $old_queue_backoff = 0; # backoff index
34 # wait for one pass of the monitor
35 $self->wait_for_monitor;
38 $self->send_to_parent("worker_bored 50 delete");
39 $self->read_from_parent(1);
42 # call our workers, and have them do things
43 # RETVAL = 0; I think I am done working for now
44 # RETVAL = 1; I have more work to do
45 my $tempres = $self->process_tempfiles;
# NOTE(review): the declaration of $delres is not visible in this view.
47 if (time() > $old_queue_check) {
48 $self->reenqueue_delayed_deletes;
49 $delres = $self->process_deletes;
50 # if we did no work, crawl the backoff.
52 $old_queue_backoff = 0;
# The next check is only pushed into the future (time() + backoff, i.e. in
# seconds) once the backoff index has grown past 360; the index itself stops
# growing once it is past 1800.
55 $old_queue_check = time() + $old_queue_backoff
56 if $old_queue_backoff > 360;
57 $old_queue_backoff++ unless $old_queue_backoff > 1800;
61 my $delres2 = $self->process_deletes2;
63 # unless someone did some work, let's sleep
64 unless ($tempres || $delres || $delres2) {
# ramp the sleep counter up by one per idle pass, capped at $sleep_max
65 $sleep_for++ if $sleep_for < $sleep_max;
# process_tempfiles: purge stale rows from the tempfile table.
#
# Asks the store for tempfiles older than $too_old (taken from
# $ENV{T_TEMPFILE_TOO_OLD}, defaulting to 3600) and returns 0 when there are
# none. For the rows it processes it collects the fid ids plus one DevFID per
# device listed in the row's comma-separated devids column, then inserts
# file_on rows, enqueues the fids for deletion and removes the tempfile rows.
# Returns 0 as well when no fid ids were collected.
#
# NOTE(review): several lines of this sub (e.g. the $self unpacking, loop
# exits, the condition guarding delete_tempfile_row, the final return value)
# are not visible in this view -- confirm against the full file.
74 sub process_tempfiles
{
76 # also clean the tempfile table
77 #mysql> select * from tempfile where createtime < unix_timestamp() - 86400 limit 50;
78 #+--------+------------+---------+------+---------+--------+
79 #| fid | createtime | classid | dmid | dkey | devids |
80 #+--------+------------+---------+------+---------+--------+
81 #| 3253 | 1149451058 | 1 | 1 | file574 | 1,2 |
82 #| 4559 | 1149451156 | 1 | 1 | file83 | 1,2 |
83 #| 11024 | 1149451697 | 1 | 1 | file836 | 2,1 |
84 #| 19885 | 1149454542 | 1 | 1 | file531 | 1,2 |
87 # the fids might exist on one of the devices in devids column if we assigned them those,
88 # they wrote some to one of them, then they died or for whatever reason didn't create_close
89 # to use, so we shouldn't delete from tempfile before going on a hunt of the missing fid.
90 # perhaps we should just add to the file_on table for both devids, and let the regular delete
91 # process discover via 404 that they're not there.
93 # select fid, devids from tempfile where createtime < unix_timestamp() - 86400
94 # add file_on rows for both of those,
95 # add fid to fids_to_delete table,
96 # delete from tempfile where fid=?
99 # dig up some temporary files to purge
100 my $sto = Mgd
::get_store
();
# age threshold, overridable through the environment
# (presumably seconds -- TODO confirm against old_tempfiles)
101 my $too_old = int($ENV{T_TEMPFILE_TOO_OLD
} || 3600);
102 my $tempfiles = $sto->old_tempfiles($too_old);
103 return 0 unless $tempfiles && @
$tempfiles;
105 # insert the right rows into file_on and file_to_delete and remove the
106 # now expunged (or soon to be) rows from tempfile
107 my (@devfids, @fidids);
108 foreach my $row (@
$tempfiles) {
110 # If FID is still loadable, we've arrived here due to a bug or race
111 # condition elsewhere. Remove the tempfile row but don't delete the
# each row is read as [fid id, devids string]
113 my $fidid = $row->[0];
114 my $fid = MogileFS
::FID
->new($fidid);
# NOTE(review): the condition guarding this call is not visible here;
# per the comment above it presumably runs only when the FID still loads.
116 $sto->delete_tempfile_row($fidid);
119 push @fidids, $fidid;
121 # sanity check the string column.
122 my $devids = $row->[1];
# accept only a comma-separated list of decimal device ids
123 unless ($devids =~ /^(\d+)(,\d+)*$/) {
# one DevFID per (device id, fid id) pair named in the row
127 foreach my $devid (split /,/, $devids) {
128 push @devfids, MogileFS
::DevFID
->new($devid, $row->[0]);
132 # We might've done no work due to discovering the tempfiles are real.
133 return 0 unless @fidids;
135 $sto->mass_insert_file_on(@devfids);
136 $sto->enqueue_fids_to_delete2(@fidids);
# NOTE(review): fid ids are interpolated straight into the IN (...) list;
# this assumes they are numeric ids as returned by the store -- verify.
137 $sto->dbh->do("DELETE FROM tempfile WHERE fid IN (" . join(',', @fidids) . ")");
141 # new style delete queueing. I'm not putting a lot of effort into commonizing
142 # code between the old one and the new one. Feel free to send a patch!
# process_deletes2: work through the 'delete' queue returned by queue_todo.
#
# For every queued fid it loads the fid's devids and, per device:
#   * removes the fid/devid mapping when the device state is permanently dead;
#   * reschedules the fid (relative delay growing with the item's failcount)
#     when the device is not observed writeable or cannot be deleted from;
#   * otherwise sends an HTTP/1.0 DELETE for the file's URL and treats a 2xx
#     or 404 status as success, rescheduling on other statuses.
# When no devids remain for the fid, its file_to_delete2 row is removed.
#
# NOTE(review): many lines (the $self unpacking, `next` statements, else
# branches, closing braces, return values) are not visible in this view.
143 sub process_deletes2
{
146 my $sto = Mgd
::get_store
();
148 my $queue_todo = $self->queue_todo('delete');
149 unless (@
$queue_todo) {
154 while (my $todo = shift @
$queue_todo) {
157 # load all the devids related to this fid, and delete.
158 my $fid = MogileFS
::FID
->new($todo->{fid
});
159 my $fidid = $fid->id;
160 my @devids = $fid->devids;
# %devids tracks the devices still holding this fid; an entry is deleted each
# time a copy is dealt with, and the fid is finished once the hash is empty.
161 my %devids = map { $_ => 1 } @devids;
164 for my $devid (@devids) {
165 my $dev = $devid ? Mgd
::device_factory
()->get_by_id($devid) : undef;
166 error
("deleting fid $fidid, on devid ".($devid || 'NULL')."...") if $Mgd::DEBUG
>= 2;
# NOTE(review): $dev is undef when $devid is false; no guard before the
# method call below is visible in this view -- verify one exists.
170 if ($dev->dstate->is_perm_dead) {
171 $sto->remove_fidid_from_devid($fidid, $devid);
172 delete $devids{$devid};
175 # devid is observed down/readonly: delay for at least
177 unless ($dev->observed_writeable) {
# delay = 60 * (10 + failcount); presumably seconds -- TODO confirm
178 $sto->reschedule_file_to_delete2_relative($fidid,
179 60 * (10 + $todo->{failcount
}));
182 # devid is marked readonly/down/etc: delay for
184 unless ($dev->can_delete_from) {
# delay = 60 * 60 * (1 + failcount); presumably seconds -- TODO confirm
185 $sto->reschedule_file_to_delete2_relative($fidid,
186 60 * 60 * (1 + $todo->{failcount
}));
190 my $dfid = MogileFS
::DevFID
->new($dev, $fidid);
191 my $path = $dfid->url;
193 # dormando: "There are cases where url can return undefined,
194 # Mogile appears to try to replicate to bogus devices
197 error
("in deleter, url(devid=$devid, fid=$fidid) returned nothing");
# $urlparts is used below as [0] = peer address, [1] = peer port,
# [2] = the request target placed in the DELETE line
201 my $urlparts = MogileFS
::Util
::url_parts
($path);
203 # hit up the server and delete it
204 # TODO: (optimization) use MogileFS->get_observed_state and don't
205 # try to delete things known to be down/etc
206 my $sock = IO
::Socket
::INET
->new(PeerAddr
=> $urlparts->[0],
207 PeerPort
=> $urlparts->[1],
209 # this used to mark the device as down for the whole tracker.
210 # if the device is actually down, we can struggle until the
211 # monitor job figures it out... otherwise an occasional timeout
212 # due to high load will prevent delete from working at all.
214 $sto->reschedule_file_to_delete2_relative($fidid,
215 60 * 60 * (1 + $todo->{failcount
}));
219 # send delete request
220 error
("Sending delete for $path") if $Mgd::DEBUG
>= 2;
222 $sock->write("DELETE $urlparts->[2] HTTP/1.0\r\n\r\n");
# only the first line of the reply (the status line) is read
223 my $response = <$sock>;
224 if ($response =~ m!^HTTP/\d+\.\d+\s+(\d+)!) {
# 404 counts as success: the file is already absent on that device
225 if (($1 >= 200 && $1 <= 299) || $1 == 404) {
226 # effectively means all went well
227 $sto->remove_fidid_from_devid($fidid, $devid);
228 delete $devids{$devid};
230 # remote file system error? mark node as down
# NOTE(review): $httpcode is not assigned in the visible lines; presumably
# it is set from $1 on a line missing from this view -- verify.
232 error
("Error: unlink failure: $path: HTTP code $httpcode");
234 $sto->reschedule_file_to_delete2_relative($fidid,
235 60 * 30 * (1 + $todo->{failcount
}));
239 error
("Error: unknown response line deleting $path: $response");
# every device copy has been dealt with: drop the fid from the queue table
244 unless (keys %devids) {
245 $sto->delete_fid_from_file_to_delete2($fidid);
# process_deletes: old-style delete queue, driven by the file_to_delete table.
#
# Selects (fid, devid) pairs by LEFT JOINing file_to_delete with file_on,
# returns 0 when there are none, then walks the rows in shuffled order until
# the counter passes PER_BATCH. Per row, helper closures either finish the
# fid, drop one file_on mapping, or move the fid to file_to_delete_later:
#   * NULL devid                      -> fid is finished;
#   * unknown or perm-dead device     -> mapping is dropped;
#   * not observed writeable          -> rescheduled (60 * 10);
#   * cannot delete from device       -> rescheduled (60 * 60 * 2);
#   * otherwise an HTTP/1.0 DELETE is sent; 2xx or 404 drops the mapping,
#     other statuses reschedule (60 * 30).
#
# NOTE(review): many lines (the $self unpacking, the $dbh and $done
# declarations, the tail of the SQL, `next` statements, closing braces, the
# return value) are not visible in this view -- confirm against the full file.
254 sub process_deletes
{
257 my $sto = Mgd
::get_store
();
260 my $delmap = $dbh->selectall_arrayref("SELECT fd.fid, fo.devid ".
261 "FROM file_to_delete fd ".
262 "LEFT JOIN file_on fo ON fd.fid=fo.fid ".
264 my $count = $delmap ?
scalar @
$delmap : 0;
265 return 0 unless $count;
# rows are visited in random order and the pass stops after PER_BATCH rows
268 foreach my $dm (List
::Util
::shuffle
(@
$delmap)) {
269 last if ++$done > PER_BATCH
;
272 my ($fid, $devid) = @
$dm;
273 error
("deleting fid $fid, on devid ".($devid || 'NULL')."...") if $Mgd::DEBUG
>= 2;
# closure: remove this fid from file_to_delete
275 my $done_with_fid = sub {
277 $dbh->do("DELETE FROM file_to_delete WHERE fid=?", undef, $fid);
278 $sto->condthrow("Failure to delete from file_to_delete for fid=$fid");
# closure: remove the file_on row for this fid/devid pair
281 my $done_with_devid = sub {
283 $dbh->do("DELETE FROM file_on WHERE fid=? AND devid=?",
284 undef, $fid, $devid);
285 $sto->condthrow("Failure to delete from file_on for $fid/$devid");
286 die "Failed to delete from file_on: " . $dbh->errstr if $dbh->err;
# closure: park the fid in file_to_delete_later ($secs is forced to an
# integer before being interpolated into the SQL), then remove it from
# file_to_delete via $done_with_fid
289 my $reschedule_fid = sub {
290 my ($secs, $reason) = (int(shift), shift);
291 $sto->insert_ignore("INTO file_to_delete_later (fid, delafter) ".
292 "VALUES (?,".$sto->unix_timestamp."+$secs)", undef,
294 error
("delete of fid $fid rescheduled: $reason") if $Mgd::DEBUG
>= 2;
295 $done_with_fid->("rescheduled");
299 # devid is null: doesn't exist anywhere anymore, we're done with this fid.
300 # devid is observed down/readonly: delay for 10 minutes
301 # devid is marked readonly: delay for 2 hours
302 # devid is marked dead or doesn't exist: consider it deleted on this devid.
304 # CASE: devid is null, which means we're done deleting all instances.
305 unless (defined $devid) {
306 $done_with_fid->("no_more_locations");
310 # CASE: devid is marked dead or doesn't exist: consider it deleted on this devid.
311 # (Note: we're tolerant of '0' as a devid, due to old buggy version which
312 # would sometimes put that in there)
313 my $dev = $devid ? Mgd
::device_factory
()->get_by_id($devid) : undef;
314 unless ($dev && $dev->exists) {
315 $done_with_devid->("devid_doesnt_exist");
318 if ($dev->dstate->is_perm_dead) {
319 $done_with_devid->("devid_marked_dead");
323 # CASE: devid is observed down/readonly: delay for 10 minutes
324 unless ($dev->observed_writeable) {
325 $reschedule_fid->(60 * 10, "not_observed_writeable");
329 # CASE: devid is marked readonly/down/etc: delay for 2 hours
330 unless ($dev->can_delete_from) {
331 $reschedule_fid->(60 * 60 * 2, "devid_marked_not_alive");
335 my $dfid = MogileFS
::DevFID
->new($dev, $fid);
336 my $path = $dfid->url;
338 # dormando: "There are cases where url can return undefined,
339 # Mogile appears to try to replicate to bogus devices
342 error
("in deleter, url(devid=$devid, fid=$fid) returned nothing");
# $urlparts is used below as [0] = peer address, [1] = peer port,
# [2] = the request target placed in the DELETE line
346 my $urlparts = MogileFS
::Util
::url_parts
($path);
348 # hit up the server and delete it
349 # TODO: (optimization) use MogileFS->get_observed_state and don't try to delete things known to be down/etc
350 my $sock = IO
::Socket
::INET
->new(PeerAddr
=> $urlparts->[0],
351 PeerPort
=> $urlparts->[1],
354 # timeout or something, mark this device as down for now and move on
355 $self->broadcast_host_unreachable($dev->hostid);
356 $reschedule_fid->(60 * 60 * 2, "no_sock_to_hostid");
360 # send delete request
361 error
("Sending delete for $path") if $Mgd::DEBUG
>= 2;
363 $sock->write("DELETE $urlparts->[2] HTTP/1.0\r\n\r\n");
# only the first line of the reply (the status line) is read
364 my $response = <$sock>;
365 if ($response =~ m!^HTTP/\d+\.\d+\s+(\d+)!) {
# 404 counts as success: the file is already absent on that device
366 if (($1 >= 200 && $1 <= 299) || $1 == 404) {
367 # effectively means all went well
368 $done_with_devid->("deleted");
370 # remote file system error? mark node as down
# NOTE(review): $httpcode is not assigned in the visible lines; presumably
# it is set from $1 on a line missing from this view -- verify.
372 error
("Error: unlink failure: $path: HTTP code $httpcode");
373 $reschedule_fid->(60 * 30, "http_code_$httpcode");
377 error
("Error: unknown response line deleting $path: $response");
381 # as far as we know, we have more work to do
# reenqueue_delayed_deletes: move delayed fids back onto the old delete queue.
#
# Fetches fid ids from the store via fids_to_delete_again, re-enqueues them
# with enqueue_fids_to_delete, then deletes the matching rows from
# file_to_delete_later and lets the store raise on a database error.
#
# NOTE(review): the $self unpacking, the $dbh declaration and the arguments
# to fids_to_delete_again are not visible in this view.
385 sub reenqueue_delayed_deletes
{
388 my $sto = Mgd
::get_store
();
391 my @fidids = $sto->fids_to_delete_again
394 $sto->enqueue_fids_to_delete(@fidids);
# NOTE(review): an empty @fidids would yield "IN ()" here; an early return
# is presumably on a line missing from this view -- verify. The ids are
# interpolated directly, which assumes they are numeric.
396 $dbh->do("DELETE FROM file_to_delete_later WHERE fid IN (" .
397 join(",", @fidids) . ")");
398 $sto->condthrow("reenqueue file_to_delete_later delete");
406 # indent-tabs-mode: nil