1 package MogileFS
::Store
;
5 use MogileFS
::Util
qw(throw max error);
6 use DBI
; # no reason a Store has to be DBI-based, but for now they all are.
9 # this is incremented whenever the schema changes. server will refuse
10 # to start-up with an old schema version
12 # 6: adds file_to_replicate table
13 # 7: adds file_to_delete_later table
14 # 8: adds fsck_log table
15 # 9: adds 'drain' state to enum in device table
16 # 10: adds 'replpolicy' column to 'class' table
17 # 11: adds 'file_to_queue' table
18 # 12: adds 'file_to_delete2' table
19 # 13: modifies 'server_settings.value' to TEXT for wider values
20 # also adds a TEXT 'arg' column to file_to_queue for passing arguments
21 # 14: modifies 'device' mb_total, mb_used to INT for devs > 16TB
22 use constant SCHEMA_VERSION
=> 14;
26 return $class->new_from_dsn_user_pass(map { MogileFS
->config($_) } qw(db_dsn db_user db_pass max_handles));
29 sub new_from_dsn_user_pass
{
30 my ($class, $dsn, $user, $pass, $max_handles) = @_;
32 if ($dsn =~ /^DBI:mysql:/i) {
33 $subclass = "MogileFS::Store::MySQL";
34 } elsif ($dsn =~ /^DBI:SQLite:/i) {
35 $subclass = "MogileFS::Store::SQLite";
36 } elsif ($dsn =~ /^DBI:Oracle:/i) {
37 $subclass = "MogileFS::Store::Oracle";
38 } elsif ($dsn =~ /^DBI:Pg:/i) {
39 $subclass = "MogileFS::Store::Postgres";
41 die "Unknown database type: $dsn";
43 unless (eval "use $subclass; 1") {
44 die "Error loading $subclass: $@\n";
50 max_handles
=> $max_handles, # Max number of handles to allow
51 raise_errors
=> $subclass->want_raise_errors,
52 slave_list_cachetime
=> 0,
53 slave_list_cache
=> [],
54 recheck_req_gen
=> 0, # incremented generation, of recheck of dbh being requested
55 recheck_done_gen
=> 0, # once recheck is done, copy of what the request generation was
56 handles_left
=> 0, # amount of times this handle can still be verified
57 server_setting_cache
=> {}, # value-agnostic db setting cache.
63 # Defaults to true now.
64 sub want_raise_errors
{
68 sub new_from_mogdbsetup
{
69 my ($class, %args) = @_;
70 # where args is: dbhost dbport dbname dbrootuser dbrootpass dbuser dbpass
71 my $dsn = $class->dsn_of_dbhost($args{dbname
}, $args{dbhost
}, $args{dbport
});
73 my $try_make_sto = sub {
74 my $dbh = DBI
->connect($dsn, $args{dbuser
}, $args{dbpass
}, {
77 my $sto = $class->new_from_dsn_user_pass($dsn, $args{dbuser
}, $args{dbpass
});
82 # upgrading, apparently, as this database already exists.
83 my $sto = $try_make_sto->();
86 # otherwise, we need to make the requested database, setup permissions, etc
87 $class->status("couldn't connect to database as mogilefs user. trying root...");
88 my $rootdsn = $class->dsn_of_root($args{dbname
}, $args{dbhost
}, $args{dbport
});
89 my $rdbh = DBI
->connect($rootdsn, $args{dbrootuser
}, $args{dbrootpass
}, {
92 die "Failed to connect to $rootdsn as specified root user ($args{dbrootuser}): " . DBI
->errstr . "\n";
93 $class->status("connected to database as root user.");
95 $class->confirm("Create/Upgrade database name '$args{dbname}'?");
96 $class->create_db_if_not_exists($rdbh, $args{dbname
});
97 $class->confirm("Grant all privileges to user '$args{dbuser}', connecting from anywhere, to the mogilefs database '$args{dbname}'?");
98 $class->grant_privileges($rdbh, $args{dbname
}, $args{dbuser
}, $args{dbpass
});
100 # should be ready now:
101 $sto = $try_make_sto->();
104 die "Failed to connect to database as regular user, even after creating it and setting up permissions as the root user.";
107 # given a root DBI connection, create the named database. succeed
# if it's made, or already exists. die otherwise.
109 sub create_db_if_not_exists
{
110 my ($pkg, $rdbh, $dbname) = @_;
111 $rdbh->do("CREATE DATABASE IF NOT EXISTS $dbname")
112 or die "Failed to create database '$dbname': " . $rdbh->errstr . "\n";
115 sub grant_privileges
{
116 my ($pkg, $rdbh, $dbname, $user, $pass) = @_;
117 $rdbh->do("GRANT ALL PRIVILEGES ON $dbname.* TO $user\@'\%' IDENTIFIED BY ?",
119 or die "Failed to grant privileges: " . $rdbh->errstr . "\n";
120 $rdbh->do("GRANT ALL PRIVILEGES ON $dbname.* TO $user\@'localhost' IDENTIFIED BY ?",
122 or die "Failed to grant privileges: " . $rdbh->errstr . "\n";
# Capability flags: concrete Store subclasses override these to advertise
# which SQL features their database supports.
sub can_replace      { return 0; }  # REPLACE INTO
sub can_insertignore { return 0; }  # INSERT IGNORE
sub can_insert_multi { return 0; }  # multi-row INSERT
sub can_for_update   { return 1; }  # SELECT ... FOR UPDATE
# SQL expression yielding the database's current unix time.  There is no
# portable default, so every concrete Store subclass must override this.
sub unix_timestamp {
    die "No function in $_[0] to return DB's unixtime.";
}
134 return "INSERT IGNORE " if $self->can_insertignore;
135 return "REPLACE " if $self->can_replace;
136 die "Can't INSERT IGNORE or REPLACE?";
# Pluggable progress/confirmation hooks used by mogdbsetup.  By default,
# status messages are discarded and every confirmation is approved.
my $on_status  = sub {};
my $on_confirm = sub { 1 };

# Install a callback to receive status messages.
sub on_status {
    my ($pkg, $code) = @_;
    $on_status = $code;
}

# Install a callback that approves (true) or rejects (false) prompts.
sub on_confirm {
    my ($pkg, $code) = @_;
    $on_confirm = $code;
}

# Report a progress message through the installed hook.
sub status {
    my ($pkg, $msg) = @_;
    $on_status->($msg);
}

# Ask the installed hook for permission; dies if it declines.
sub confirm {
    my ($pkg, $msg) = @_;
    $on_confirm->($msg)
        or die "Aborted.\n";
}
# Class-method wrapper so callers can query the newest schema version
# this server code understands (the SCHEMA_VERSION constant above).
sub latest_schema_version {
    return SCHEMA_VERSION;
}
150 $self->{raise_errors
} = 1;
151 $self->dbh->{RaiseError
} = 1;
# Read-only accessors for the connection parameters this Store was
# constructed with.
sub dsn  { my ($self) = @_; return $self->{dsn};  }
sub user { my ($self) = @_; return $self->{user}; }
sub pass { my ($self) = @_; return $self->{pass}; }
# Hook invoked right after a fresh DBI connection is established;
# subclasses may override for per-connection setup.  Default is a no-op.
sub post_dbi_connect { return 1; }

# Whether this Store implementation supports read slaves; off by default.
sub can_do_slaves { return 0; }
165 die "Incapable of becoming slave." unless $self->can_do_slaves;
172 return $self->{slave
};
175 # Returns a list of arrayrefs, each being [$dsn, $username, $password] for connecting to a slave DB.
180 # only reload every 15 seconds.
181 if ($self->{slave_list_cachetime
} > $now - 15) {
182 return @
{$self->{slave_list_cache
}};
184 $self->{slave_list_cachetime
} = $now;
185 $self->{slave_list_cache
} = [];
187 my $sk = MogileFS
::Config
->server_setting('slave_keys')
191 foreach my $key (split /\s*,\s*/, $sk) {
192 my $slave = MogileFS
::Config
->server_setting("slave_$key");
195 error
("key for slave DB config: slave_$key not found in configuration");
199 my ($dsn, $user, $pass) = split /\|/, $slave;
200 if (!defined($dsn) or !defined($user) or !defined($pass)) {
201 error
("key slave_$key contains $slave, which doesn't split in | into DSN|user|pass - ignoring");
204 push @ret, [$dsn, $user, $pass]
207 $self->{slave_list_cache
} = \
@ret;
214 die "Incapable of having slaves." unless $self->can_do_slaves;
216 return $self->{slave
} if $self->check_slave;
218 my @slaves_list = $self->_slaves_list;
220 # If we have no slaves, then return silently.
221 return unless @slaves_list;
223 foreach my $slave_fulldsn (@slaves_list) {
224 my $newslave = $self->{slave
} = $self->new_from_dsn_user_pass(@
$slave_fulldsn);
225 $self->{slave_next_check
} = 0;
226 $newslave->mark_as_slave;
228 if $self->check_slave;
231 warn "Slave list exhausted, failing back to master.";
238 return $self unless $self->can_do_slaves;
240 if ($self->{slave_ok
}) {
241 if (my $slave = $self->get_slave) {
242 $slave->{recheck_req_gen
} = $self->{recheck_req_gen
};
254 return unless ref $coderef eq 'CODE';
256 local $self->{slave_ok
} = 1;
258 return $coderef->(@_);
263 $self->{recheck_req_gen
}++;
270 if ($self->{recheck_done_gen
} != $self->{recheck_req_gen
}) {
271 $self->{dbh
} = undef unless $self->{dbh
}->ping;
272 # Handles a memory leak under Solaris/Postgres.
273 $self->{dbh
} = undef if ($self->{max_handles
} &&
274 $self->{handles_left
}-- < 0);
275 $self->{recheck_done_gen
} = $self->{recheck_req_gen
};
277 return $self->{dbh
} if $self->{dbh
};
280 $self->{dbh
} = DBI
->connect($self->{dsn
}, $self->{user
}, $self->{pass
}, {
283 # FUTURE: will default to on (have to validate all callers first):
284 RaiseError
=> ($self->{raise_errors
} || 0),
286 die "Failed to connect to database: " . DBI
->errstr;
287 $self->post_dbi_connect;
288 $self->{handles_left
} = $self->{max_handles
} if $self->{max_handles
};
294 return $self->dbh->ping;
298 my ($self, $optmsg) = @_;
299 my $dbh = $self->dbh;
300 return unless $dbh->err;
301 my ($pkg, $fn, $line) = caller;
302 my $msg = "Database error from $pkg/$fn/$line: " . $dbh->errstr;
303 $msg .= ": $optmsg" if $optmsg;
304 # Auto rollback failures around transactions.
305 if ($dbh->{AutoCommit
} == 0) { eval { $dbh->rollback }; }
310 my ($self, $sql, @do_params) = @_;
311 my $rv = eval { $self->dbh->do($sql, @do_params) };
312 return $rv unless $@
|| $self->dbh->err;
313 warn "Error with SQL: $sql\n";
314 Carp
::confess
($@
|| $self->dbh->errstr);
318 croak
("Odd number of parameters!") if scalar(@_) % 2;
319 my ($self, $vlist, %uarg) = @_;
321 $ret{$_} = delete $uarg{$_} foreach @
$vlist;
322 croak
("Bogus options: ".join(',',keys %uarg)) if %uarg;
326 sub was_deadlock_error
{
328 my $dbh = $self->dbh;
332 sub was_duplicate_error
{
334 my $dbh = $self->dbh;
338 # run a subref (presumably a database update) in an eval, because you expect it to
339 # maybe fail on duplicate key error, and throw a dup exception for you, else return
342 my ($self, $code) = @_;
343 my $rv = eval { $code->(); };
344 throw
("dup") if $self->was_duplicate_error;
349 # insert row if doesn't already exist
350 # WARNING: This function is NOT transaction safe if the duplicate errors causes
351 # your transaction to halt!
352 # WARNING: This function is NOT safe on multi-row inserts if can_insertignore
353 # is false! Rows before the duplicate will be inserted, but rows after the
# duplicate might not be, depending on your database.
356 my ($self, $sql, @params) = @_;
357 my $dbh = $self->dbh;
358 if ($self->can_insertignore) {
359 return $dbh->do("INSERT IGNORE $sql", @params);
361 # TODO: Detect bad multi-row insert here.
362 my $rv = eval { $dbh->do("INSERT $sql", @params); };
363 if ($@
|| $dbh->err) {
364 return 1 if $self->was_duplicate_error;
365 # This chunk is identical to condthrow, but we include it directly
366 # here as we know there is definitely an error, and we would like
367 # the caller of this function.
368 my ($pkg, $fn, $line) = caller;
369 my $msg = "Database error from $pkg/$fn/$line: " . $dbh->errstr;
376 sub retry_on_deadlock
{
379 my $tries = shift || 3;
380 croak
("deadlock retries must be positive") if $tries < 1;
383 while ($tries-- > 0) {
384 $rv = eval { $code->(); };
385 next if ($self->was_deadlock_error);
392 # --------------------------------------------------------------------------
396 sub add_extra_tables
{
398 push @extra_tables, @_;
401 use constant TABLES
=> qw( domain class file tempfile file_to_delete
402 unreachable_fids file_on file_on_corrupt host
403 device server_settings file_to_replicate
404 file_to_delete_later fsck_log file_to_queue
410 my $curver = $sto->schema_version;
412 my $latestver = SCHEMA_VERSION
;
413 if ($curver == $latestver) {
414 $sto->status("Schema already up-to-date at version $curver.");
418 if ($curver > $latestver) {
419 die "Your current schema version is $curver, but this version of mogdbsetup only knows up to $latestver. Aborting to be safe.\n";
423 $sto->confirm("Install/upgrade your schema from version $curver to version $latestver?");
426 foreach my $t (TABLES
, @extra_tables) {
427 $sto->create_table($t);
430 $sto->upgrade_add_host_getport;
431 $sto->upgrade_add_host_altip;
432 $sto->upgrade_add_device_asof;
433 $sto->upgrade_add_device_weight;
434 $sto->upgrade_add_device_readonly;
435 $sto->upgrade_add_device_drain;
436 $sto->upgrade_add_class_replpolicy;
437 $sto->upgrade_modify_server_settings_value;
438 $sto->upgrade_add_file_to_queue_arg;
439 $sto->upgrade_modify_device_size;
444 sub cached_schema_version
{
446 return $self->{_cached_schema_version
} ||=
447 $self->schema_version;
452 my $dbh = $self->dbh;
454 $dbh->selectrow_array("SELECT value FROM server_settings WHERE field='schema_version'") || 0;
# Hook letting subclasses rewrite CREATE TABLE SQL (e.g. to strip
# MySQL-isms) before it is executed.  Base implementation is identity.
sub filter_create_sql {
    my ($self, $sql) = @_;
    return $sql;
}
461 my ($self, $table) = @_;
462 my $dbh = $self->dbh;
463 return 1 if $self->table_exists($table);
464 my $meth = "TABLE_$table";
465 my $sql = $self->$meth;
466 $sql = $self->filter_create_sql($sql);
467 $self->status("Running SQL: $sql;");
469 die "Failed to create table $table: " . $dbh->errstr;
470 my $imeth = "INDEXES_$table";
471 my @indexes = eval { $self->$imeth };
472 foreach $sql (@indexes) {
473 $self->status("Running SQL: $sql;");
475 die "Failed to create indexes on $table: " . $dbh->errstr;
479 # Please try to keep all tables aligned nicely
480 # with '"CREATE TABLE' on the first line
481 # and ')"' alone on the last line.
484 # classes are tied to domains. domains can have classes of items
485 # with different mindevcounts.
487 # a minimum devcount is the number of copies the system tries to
488 # maintain for files in that class
490 # unspecified classname means classid=0 (implicit class), and that
491 # implies mindevcount=2
492 "CREATE TABLE domain (
493 dmid SMALLINT UNSIGNED NOT NULL PRIMARY KEY,
494 namespace VARCHAR(255),
500 "CREATE TABLE class (
501 dmid SMALLINT UNSIGNED NOT NULL,
502 classid TINYINT UNSIGNED NOT NULL,
503 PRIMARY KEY (dmid,classid),
504 classname VARCHAR(50),
505 UNIQUE (dmid,classname),
506 mindevcount TINYINT UNSIGNED NOT NULL
510 # the length field is only here for easy verifications of content
511 # integrity when copying around. no sums or content types or other
512 # metadata here. application can handle that.
514 # classid is what class of file this belongs to. for instance, on fotobilder
515 # there will be a class for original pictures (the ones the user uploaded)
516 # and a class for derived images (scaled down versions, thumbnails, greyscale, etc)
517 # each domain can setup classes and assign the minimum redundancy level for
518 # each class. fotobilder will use a 2 or 3 minimum copy redundancy for original
# photos and a 1 minimum for derived images (which means the sole device
520 # for a derived image can die, bringing devcount to 0 for that file, but
521 # the application can recreate it from its original)
524 fid INT UNSIGNED NOT NULL,
527 dmid SMALLINT UNSIGNED NOT NULL,
528 dkey VARCHAR(255), # domain-defined
529 UNIQUE dkey (dmid, dkey),
531 length BIGINT UNSIGNED, # big limit
533 classid TINYINT UNSIGNED NOT NULL,
534 devcount TINYINT UNSIGNED NOT NULL,
535 INDEX devcount (dmid,classid,devcount)
540 "CREATE TABLE tempfile (
541 fid INT UNSIGNED NOT NULL AUTO_INCREMENT,
544 createtime INT UNSIGNED NOT NULL,
545 classid TINYINT UNSIGNED NOT NULL,
546 dmid SMALLINT UNSIGNED NOT NULL,
552 # files marked for death when their key is overwritten. then they get a new
553 # fid, but since the old row (with the old fid) had to be deleted immediately,
554 # we need a place to store the fid so an async job can delete the file from
556 sub TABLE_file_to_delete
{
557 "CREATE TABLE file_to_delete (
558 fid INT UNSIGNED NOT NULL,
563 # if the replicator notices that a fid has no sources, that file gets inserted
564 # into the unreachable_fids table. it is up to the application to actually
565 # handle fids stored in this table.
566 sub TABLE_unreachable_fids
{
567 "CREATE TABLE unreachable_fids (
568 fid INT UNSIGNED NOT NULL,
569 lastupdate INT UNSIGNED NOT NULL,
575 # what files are on what devices? (most likely physical devices,
576 # as logical devices of RAID arrays would be costly, and mogilefs
577 # already handles redundancy)
579 # the devid index lets us answer "What files were on this now-dead disk?"
581 "CREATE TABLE file_on (
582 fid INT UNSIGNED NOT NULL,
583 devid MEDIUMINT UNSIGNED NOT NULL,
584 PRIMARY KEY (fid, devid),
589 # if application or framework detects an error in one of the duplicate files
590 # for whatever reason, it can register its complaint and the framework
591 # will do some verifications and fix things up w/ an async job
592 # MAYBE: let application tell us the SHA1/MD5 of the file for us to check
593 # on the other devices?
594 sub TABLE_file_on_corrupt
{
595 "CREATE TABLE file_on_corrupt (
596 fid INT UNSIGNED NOT NULL,
597 devid MEDIUMINT UNSIGNED NOT NULL,
598 PRIMARY KEY (fid, devid)
602 # hosts (which contain devices...)
605 hostid MEDIUMINT UNSIGNED NOT NULL PRIMARY KEY,
607 status ENUM('alive','dead','down'),
608 http_port MEDIUMINT UNSIGNED DEFAULT 7500,
609 http_get_port MEDIUMINT UNSIGNED,
611 hostname VARCHAR(40),
623 "CREATE TABLE device (
624 devid MEDIUMINT UNSIGNED NOT NULL,
625 hostid MEDIUMINT UNSIGNED NOT NULL,
627 status ENUM('alive','dead','down'),
628 weight MEDIUMINT DEFAULT 100,
630 mb_total INT UNSIGNED,
631 mb_used INT UNSIGNED,
632 mb_asof INT UNSIGNED,
638 sub TABLE_server_settings
{
639 "CREATE TABLE server_settings (
640 field VARCHAR(50) PRIMARY KEY,
645 sub TABLE_file_to_replicate
{
646 # nexttry is time to try to replicate it next.
647 # 0 means immediate. it's only on one host.
648 # 1 means lower priority. it's on 2+ but isn't happy where it's at.
649 # unix timestamp means at/after that time. some previous error occurred.
650 # fromdevid, if not null, means which devid we should replicate from. perhaps it's the only non-corrupt one. otherwise, wherever.
651 # failcount. how many times we've failed, just for doing backoff of nexttry.
652 # flags. reserved for future use.
653 "CREATE TABLE file_to_replicate (
654 fid INT UNSIGNED NOT NULL PRIMARY KEY,
655 nexttry INT UNSIGNED NOT NULL,
657 fromdevid INT UNSIGNED,
658 failcount TINYINT UNSIGNED NOT NULL DEFAULT 0,
659 flags SMALLINT UNSIGNED NOT NULL DEFAULT 0
663 sub TABLE_file_to_delete_later
{
664 "CREATE TABLE file_to_delete_later (
665 fid INT UNSIGNED NOT NULL PRIMARY KEY,
666 delafter INT UNSIGNED NOT NULL,
672 "CREATE TABLE fsck_log (
673 logid INT UNSIGNED NOT NULL AUTO_INCREMENT,
675 utime INT UNSIGNED NOT NULL,
676 fid INT UNSIGNED NULL,
678 devid MEDIUMINT UNSIGNED,
683 # generic queue table, designed to be used for workers/jobs which aren't
684 # constantly in use, and are async to the user.
685 # ie; fsck, drain, rebalance.
686 sub TABLE_file_to_queue
{
687 "CREATE TABLE file_to_queue (
688 fid INT UNSIGNED NOT NULL,
690 type TINYINT UNSIGNED NOT NULL,
691 nexttry INT UNSIGNED NOT NULL,
692 failcount TINYINT UNSIGNED NOT NULL default '0',
693 flags SMALLINT UNSIGNED NOT NULL default '0',
695 PRIMARY KEY (fid, type),
696 INDEX type_nexttry (type,nexttry)
700 # new style async delete table.
701 # this is separate from file_to_queue since deletes are more actively used,
702 # and partitioning on 'type' doesn't always work so well.
703 sub TABLE_file_to_delete2
{
704 "CREATE TABLE file_to_delete2 (
705 fid INT UNSIGNED NOT NULL PRIMARY KEY,
706 nexttry INT UNSIGNED NOT NULL,
707 failcount TINYINT UNSIGNED NOT NULL default '0',
708 INDEX nexttry (nexttry)
# Schema-upgrade hooks.  The first five are only necessary for MySQL,
# since no other database existed before those schema changes; other
# drivers create the tables correctly to begin with, so the base
# implementations are success no-ops.  In the future, there might be new
# alters that non-MySQL databases will have to implement.
sub upgrade_add_host_getport    { return 1; }
sub upgrade_add_host_altip      { return 1; }
sub upgrade_add_device_asof     { return 1; }
sub upgrade_add_device_weight   { return 1; }
sub upgrade_add_device_readonly { return 1; }

# Newer upgrades: every concrete subclass must provide these.
sub upgrade_add_device_drain             { die "Not implemented in $_[0]" }
sub upgrade_modify_server_settings_value { die "Not implemented in $_[0]" }
sub upgrade_add_file_to_queue_arg        { die "Not implemented in $_[0]" }
sub upgrade_modify_device_size           { die "Not implemented in $_[0]" }
726 sub upgrade_add_class_replpolicy
{
728 unless ($self->column_type("class", "replpolicy")) {
729 $self->dowell("ALTER TABLE class ADD COLUMN replpolicy VARCHAR(255)");
733 # return true if deleted, 0 if didn't exist, exception if error
735 my ($self, $hostid) = @_;
736 return $self->dbh->do("DELETE FROM host WHERE hostid = ?", undef, $hostid);
739 # return true if deleted, 0 if didn't exist, exception if error
741 my ($self, $dmid) = @_;
742 throw
("has_files") if $self->domain_has_files($dmid);
743 throw
("has_classes") if $self->domain_has_classes($dmid);
744 return $self->dbh->do("DELETE FROM domain WHERE dmid = ?", undef, $dmid);
747 sub domain_has_files
{
748 my ($self, $dmid) = @_;
749 my $has_a_fid = $self->dbh->selectrow_array('SELECT fid FROM file WHERE dmid = ? LIMIT 1',
751 return $has_a_fid ?
1 : 0;
754 sub domain_has_classes
{
755 my ($self, $dmid) = @_;
756 my $has_a_class = $self->dbh->selectrow_array('SELECT classid FROM class WHERE dmid = ? LIMIT 1',
758 return $has_a_class ?
1 : 0;
761 sub class_has_files
{
762 my ($self, $dmid, $clid) = @_;
763 my $has_a_fid = $self->dbh->selectrow_array('SELECT fid FROM file WHERE dmid = ? AND classid = ? LIMIT 1',
764 undef, $dmid, $clid);
765 return $has_a_fid ?
1 : 0;
768 # return new classid on success (non-zero integer), die on failure
769 # throw 'dup' on duplicate name
770 # override this if you want a less racy version.
772 my ($self, $dmid, $classname) = @_;
773 my $dbh = $self->dbh;
775 # get the max class id in this domain
776 my $maxid = $dbh->selectrow_array
777 ('SELECT MAX(classid) FROM class WHERE dmid = ?', undef, $dmid) || 0;
779 # now insert the new class
781 $dbh->do("INSERT INTO class (dmid, classid, classname, mindevcount) VALUES (?, ?, ?, ?)",
782 undef, $dmid, $maxid + 1, $classname, 2);
784 if ($@
|| $dbh->err) {
785 if ($self->was_duplicate_error) {
789 return $maxid + 1 if $rv;
794 # return 1 on success, throw "dup" on duplicate name error, die otherwise
795 sub update_class_name
{
797 my %arg = $self->_valid_params([qw(dmid classid classname)], @_);
799 $self->dbh->do("UPDATE class SET classname=? WHERE dmid=? AND classid=?",
800 undef, $arg{classname
}, $arg{dmid
}, $arg{classid
});
802 throw
("dup") if $self->was_duplicate_error;
807 # return 1 on success, die otherwise
808 sub update_class_mindevcount
{
810 my %arg = $self->_valid_params([qw(dmid classid mindevcount)], @_);
812 $self->dbh->do("UPDATE class SET mindevcount=? WHERE dmid=? AND classid=?",
813 undef, $arg{mindevcount
}, $arg{dmid
}, $arg{classid
});
819 # return 1 on success, die otherwise
820 sub update_class_replpolicy
{
822 my %arg = $self->_valid_params([qw(dmid classid replpolicy)], @_);
824 $self->dbh->do("UPDATE class SET replpolicy=? WHERE dmid=? AND classid=?",
825 undef, $arg{replpolicy
}, $arg{dmid
}, $arg{classid
});
831 sub nfiles_with_dmid_classid_devcount
{
832 my ($self, $dmid, $classid, $devcount) = @_;
833 return $self->dbh->selectrow_array('SELECT COUNT(*) FROM file WHERE dmid = ? AND classid = ? AND devcount = ?',
834 undef, $dmid, $classid, $devcount);
837 sub set_server_setting
{
838 my ($self, $key, $val) = @_;
839 my $dbh = $self->dbh;
840 die "Your database does not support REPLACE! Reimplement set_server_setting!" unless $self->can_replace;
844 $dbh->do("REPLACE INTO server_settings (field, value) VALUES (?, ?)", undef, $key, $val);
846 $dbh->do("DELETE FROM server_settings WHERE field=?", undef, $key);
850 die "Error updating 'server_settings': " . $dbh->errstr if $dbh->err;
854 # FIXME: racy. currently the only caller doesn't matter, but should be fixed.
855 sub incr_server_setting
{
856 my ($self, $key, $val) = @_;
857 $val = 1 unless defined $val;
860 return 1 if $self->dbh->do("UPDATE server_settings ".
861 "SET value=value+? ".
862 "WHERE field=?", undef,
864 $self->set_server_setting($key, $val);
868 my ($self, $key) = @_;
869 return $self->dbh->selectrow_array("SELECT value FROM server_settings WHERE field=?",
873 # generic server setting cache.
874 # note that you can call the same server setting with different timeouts, but
875 # the timeout specified at the time of ... timeout, wins.
876 sub server_setting_cached
{
877 my ($self, $key, $timeout) = @_;
878 $self->{server_setting_cache
}->{$key} ||= {val
=> '', refresh
=> 0};
879 my $cache = $self->{server_setting_cache
}->{$key};
881 if ($now > $cache->{refresh
}) {
882 $cache->{val
} = $self->server_setting($key);
883 $cache->{refresh
} = $now + $timeout;
885 return $cache->{val
};
888 sub server_settings
{
891 my $sth = $self->dbh->prepare("SELECT field, value FROM server_settings");
893 while (my ($k, $v) = $sth->fetchrow_array) {
899 # register a tempfile and return the fidid, which should be allocated
900 # using autoincrement/sequences if the passed in fid is undef. however,
901 # if fid is passed in, that value should be used and returned.
903 # return new/passed in fidid on success.
904 # throw 'dup' if fid already in use
905 # return 0/undef/die on failure
907 sub register_tempfile
{
909 my %arg = $self->_valid_params([qw(fid dmid key classid devids)], @_);
911 my $dbh = $self->dbh;
914 my $explicit_fid_used = $fid ?
1 : 0;
916 # setup the new mapping. we store the devices that we picked for
917 # this file in here, knowing that they might not be used. create_close
918 # is responsible for actually mapping in file_on. NOTE: fid is being
919 # passed in, it's either some number they gave us, or it's going to be
920 # 0/undef which translates into NULL which means to automatically create
921 # one. that should be fine.
922 my $ins_tempfile = sub {
924 # We must only pass the correct number of bind parameters
925 # Using 'NULL' for the AUTO_INCREMENT/SERIAL column will fail on
926 # Postgres, where you are expected to leave it out or use DEFAULT
927 # Leaving it out seems sanest and least likely to cause problems
928 # with other databases.
929 my @keys = ('dmid', 'dkey', 'classid', 'devids', 'createtime');
930 my @vars = ('?' , '?' , '?' , '?' , $self->unix_timestamp);
931 my @vals = ($arg{dmid
}, $arg{key
}, $arg{classid
} || 0, $arg{devids
});
932 # Do not check for $explicit_fid_used, but rather $fid directly
933 # as this anonymous sub is called from the loop later
935 unshift @keys, 'fid';
939 my $sql = "INSERT INTO tempfile (".join(',',@keys).") VALUES (".join(',',@vars).")";
940 $dbh->do($sql, undef, @vals);
943 return undef if $self->was_duplicate_error;
944 die "Unexpected db error into tempfile: " . $dbh->errstr;
947 unless (defined $fid) {
948 # if they did not give us a fid, then we want to grab the one that was
949 # theoretically automatically generated
950 $fid = $dbh->last_insert_id(undef, undef, 'tempfile', 'fid')
951 or die "No last_insert_id found";
953 return undef unless defined $fid && $fid > 0;
957 unless ($ins_tempfile->()) {
958 throw
("dup") if $explicit_fid_used;
959 die "tempfile insert failed";
962 my $fid_in_use = sub {
963 my $exists = $dbh->selectrow_array("SELECT COUNT(*) FROM file WHERE fid=?", undef, $fid);
964 return $exists ?
1 : 0;
967 # if the fid is in use, do something
968 while ($fid_in_use->($fid)) {
969 throw
("dup") if $explicit_fid_used;
971 # be careful of databases which reset their
972 # auto-increment/sequences when the table is empty (InnoDB
973 # did/does this, for instance). So check if it's in use, and
974 # re-seed the table with the highest known fid from the file
977 # get the highest fid from the filetable and insert a dummy row
978 $fid = $dbh->selectrow_array("SELECT MAX(fid) FROM file");
979 $ins_tempfile->(); # don't care about its result
981 # then do a normal auto-increment
983 $ins_tempfile->() or die "register_tempfile failed after seeding";
989 # return hashref of row containing columns "fid, dmid, dkey, length,
990 # classid, devcount" provided a $dmid and $key (dkey). or undef if no
992 sub file_row_from_dmid_key
{
993 my ($self, $dmid, $key) = @_;
994 return $self->dbh->selectrow_hashref("SELECT fid, dmid, dkey, length, classid, devcount ".
995 "FROM file WHERE dmid=? AND dkey=?",
999 # return hashref of row containing columns "fid, dmid, dkey, length,
1000 # classid, devcount" provided a $fidid or undef if no row.
1001 sub file_row_from_fidid
{
1002 my ($self, $fidid) = @_;
1003 return $self->dbh->selectrow_hashref("SELECT fid, dmid, dkey, length, classid, devcount ".
1004 "FROM file WHERE fid=?",
1008 # return an arrayref of rows containing columns "fid, dmid, dkey, length,
1009 # classid, devcount" provided a pair of $fidid or undef if no rows.
1010 sub file_row_from_fidid_range
{
1011 my ($self, $fromfid, $count) = @_;
1012 my $sth = $self->dbh->prepare("SELECT fid, dmid, dkey, length, classid, devcount ".
1013 "FROM file WHERE fid > ? LIMIT ?");
1014 $sth->execute($fromfid,$count);
1015 return $sth->fetchall_arrayref({});
1018 # return array of devids that a fidid is on
1020 my ($self, $fidid) = @_;
1021 return @
{ $self->dbh->selectcol_arrayref("SELECT devid FROM file_on WHERE fid=?",
1022 undef, $fidid) || [] };
1025 # return hashref of { $fidid => [ $devid, $devid... ] } for a bunch of given @fidids
1026 sub fid_devids_multiple
{
1027 my ($self, @fidids) = @_;
1028 my $in = join(",", map { $_+0 } @fidids);
1030 my $sth = $self->dbh->prepare("SELECT fid, devid FROM file_on WHERE fid IN ($in)");
1032 while (my ($fidid, $devid) = $sth->fetchrow_array) {
1033 push @
{$ret->{$fidid} ||= []}, $devid;
1038 # return hashref of columns classid, dmid, dkey, given a $fidid, or return undef
1039 sub tempfile_row_from_fid
{
1040 my ($self, $fidid) = @_;
1041 return $self->dbh->selectrow_hashref("SELECT classid, dmid, dkey, devids ".
1042 "FROM tempfile WHERE fid=?",
1046 # return 1 on success, throw "dup" on duplicate devid or throws other error on failure
1048 my ($self, $devid, $hostid, $status) = @_;
1049 my $rv = $self->conddup(sub {
1050 $self->dbh->do("INSERT INTO device (devid, hostid, status) VALUES (?,?,?)", undef,
1051 $devid, $hostid, $status);
1054 die "error making device $devid\n" unless $rv > 0;
1059 my ($self, $devid, $to_update) = @_;
1060 my @keys = sort keys %$to_update;
1061 return unless @keys;
1062 $self->conddup(sub {
1063 $self->dbh->do("UPDATE device SET " . join('=?, ', @keys)
1064 . "=? WHERE devid=?", undef, (map { $to_update->{$_} } @keys),
1070 sub update_device_usage
{
1072 my %arg = $self->_valid_params([qw(mb_total mb_used devid)], @_);
1074 $self->dbh->do("UPDATE device SET mb_total = ?, mb_used = ?, mb_asof = " . $self->unix_timestamp .
1075 " WHERE devid = ?", undef, $arg{mb_total
}, $arg{mb_used
}, $arg{devid
});
1080 # This is unimplemented at the moment as we must verify:
1081 # - no file_on rows exist
1082 # - nothing in file_to_queue is going to attempt to use it
1083 # - nothing in file_to_replicate is going to attempt to use it
1084 # - it's already been marked dead
1085 # - that all trackers are likely to know this :/
1086 # - ensure the devid can't be reused
1087 # IE; the user can't mark it dead then remove it all at once and cause their
1088 # cluster to implode.
1090 die "Unimplemented; needs further testing";
1093 sub mark_fidid_unreachable
{
1094 my ($self, $fidid) = @_;
1095 die "Your database does not support REPLACE! Reimplement mark_fidid_unreachable!" unless $self->can_replace;
1096 $self->dbh->do("REPLACE INTO unreachable_fids VALUES (?, " . $self->unix_timestamp . ")",
1100 sub set_device_weight
{
1101 my ($self, $devid, $weight) = @_;
1103 $self->dbh->do('UPDATE device SET weight = ? WHERE devid = ?', undef, $weight, $devid);
1108 sub set_device_state
{
1109 my ($self, $devid, $state) = @_;
1111 $self->dbh->do('UPDATE device SET status = ? WHERE devid = ?', undef, $state, $devid);
1117 my ($self, $dmid, $cid) = @_;
1118 throw
("has_files") if $self->class_has_files($dmid, $cid);
1120 $self->dbh->do("DELETE FROM class WHERE dmid = ? AND classid = ?", undef, $dmid, $cid);
1126 my ($self, $fidid) = @_;
1127 eval { $self->dbh->do("DELETE FROM file WHERE fid=?", undef, $fidid); };
1129 eval { $self->dbh->do("DELETE FROM tempfile WHERE fid=?", undef, $fidid); };
1131 $self->enqueue_for_delete2($fidid, 0);
1135 sub delete_tempfile_row
{
1136 my ($self, $fidid) = @_;
1137 my $rv = eval { $self->dbh->do("DELETE FROM tempfile WHERE fid=?", undef, $fidid); };
1142 # Load the specified tempfile, then delete it. If we succeed, we were
1143 # here first; otherwise, someone else beat us here (and we return undef)
1144 sub delete_and_return_tempfile_row
{
1145 my ($self, $fidid) = @_;
1146 my $rv = $self->tempfile_row_from_fid($fidid);
1147 my $rows_deleted = $self->delete_tempfile_row($fidid);
1148 return $rv if ($rows_deleted > 0);
# (Re)insert a file row, replacing any existing row for the fid.
# Requires REPLACE support.  devcount starts at 0.
sub replace_into_file {
    my $self = shift;
    my %arg  = $self->_valid_params([qw(fidid dmid key length classid)], @_);
    die "Your database does not support REPLACE! Reimplement replace_into_file!" unless $self->can_replace;
    $self->dbh->do("REPLACE INTO file (fid, dmid, dkey, length, classid, devcount) ".
                   "VALUES (?,?,?,?,?,0) ", undef,
                   @arg{'fidid', 'dmid', 'key', 'length', 'classid'});
    $self->condthrow;
}
# returns 1 on success, 0 on duplicate key error, dies on exception
# TODO: need a test to hit the duplicate name error condition
# TODO: switch to using "dup" exception here?
# NOTE(review): eval wrapper and return paths were truncated in source;
# reconstructed — verify against upstream.
sub rename_file {
    my ($self, $fidid, $to_key) = @_;
    my $dbh = $self->dbh;
    eval {
        $dbh->do('UPDATE file SET dkey = ? WHERE fid=?',
                 undef, $to_key, $fidid);
    };
    if ($@ || $dbh->err) {
        # first is MySQL's error code for duplicates
        if ($self->was_duplicate_error) {
            return 0;
        } else {
            die $@;
        }
    }
    return 1;
}
# Look up a domain id by its namespace name; returns undef if not found.
sub get_domainid_by_name {
    my $self = shift;
    my ($dmid) = $self->dbh->selectrow_array('SELECT dmid FROM domain WHERE namespace = ?',
                                             undef, $_[0]);
    return $dmid;
}
# returns a hash of domains. Key is namespace, value is dmid.
sub get_all_domains {
    my ($self) = @_;
    my $domains = $self->dbh->selectall_arrayref('SELECT namespace, dmid FROM domain');
    return map { ($_->[0], $_->[1]) } @{ $domains || [] };
}
# Look up a classid by (dmid, classname); returns undef if not found.
sub get_classid_by_name {
    my $self = shift;
    my ($classid) = $self->dbh->selectrow_array('SELECT classid FROM class WHERE dmid = ? AND classname = ?',
                                                undef, $_[0], $_[1]);
    return $classid;
}
# returns an array of hashrefs, one hashref per row in the 'class' table.
# Includes replpolicy only when the schema is new enough to have it.
sub get_all_classes {
    my ($self) = @_;
    my (@ret, $row);

    my $repl_col = "";
    if ($self->cached_schema_version >= 10) {
        $repl_col = ", replpolicy";
    }

    my $sth = $self->dbh->prepare("SELECT dmid, classid, classname, mindevcount $repl_col FROM class");
    $sth->execute;
    push @ret, $row while $row = $sth->fetchrow_hashref;
    return @ret;
}
# add a record of fidid existing on devid
# returns 1 on success, 0 on duplicate
sub add_fidid_to_devid {
    my ($self, $fidid, $devid) = @_;
    croak("fidid not non-zero") unless $fidid;
    croak("devid not non-zero") unless $devid;

    # TODO: This should possibly be insert_ignore instead
    # As if we are adding an extra file_on entry, we do not want to replace the
    # exist one. Check REPLACE semantics.
    my $rv = $self->dowell($self->ignore_replace . " INTO file_on (fid, devid) VALUES (?,?)",
                           undef, $fidid, $devid);
    return 1 if $rv > 0;
    return 0;
}
# remove a record of fidid existing on devid
# returns 1 on success, 0 if not there anyway
sub remove_fidid_from_devid {
    my ($self, $fidid, $devid) = @_;
    my $rv = eval { $self->dbh->do("DELETE FROM file_on WHERE fid=? AND devid=?",
                                   undef, $fidid, $devid); };
    $self->condthrow;
    return $rv;
}
# Test if host exists: returns the hostid, or undef if not present.
sub get_hostid_by_id {
    my $self = shift;
    my ($hostid) = $self->dbh->selectrow_array('SELECT hostid FROM host WHERE hostid = ?',
                                               undef, $_[0]);
    return $hostid;
}
# Look up a hostid by hostname; returns undef if not found.
sub get_hostid_by_name {
    my $self = shift;
    my ($hostid) = $self->dbh->selectrow_array('SELECT hostid FROM host WHERE hostname = ?',
                                               undef, $_[0]);
    return $hostid;
}
# get all hosts from database, returns them as list of hashrefs, hashrefs
# being the row contents.  (Header/execute/return were truncated; restored.)
sub get_all_hosts {
    my ($self) = @_;
    my $sth = $self->dbh->prepare("SELECT /*!40000 SQL_CACHE */ hostid, status, hostname, " .
                                  "hostip, http_port, http_get_port, altip, altmask FROM host");
    $sth->execute;
    my @ret;
    while (my $row = $sth->fetchrow_hashref) {
        push @ret, $row;
    }
    return @ret;
}
# get all devices from database, returns them as list of hashrefs, hashrefs
# being the row contents.  (Header/execute/return were truncated; restored.)
sub get_all_devices {
    my ($self) = @_;
    my $sth = $self->dbh->prepare("SELECT /*!40000 SQL_CACHE */ devid, hostid, mb_total, " .
                                  "mb_used, mb_asof, status, weight FROM device");
    $sth->execute;
    my @ret;
    while (my $row = $sth->fetchrow_hashref) {
        push @ret, $row;
    }
    return @ret;
}
# update the device count for a given fidid (recount from file_on and
# store the result on the file row).  Returns 1; dies on DB error.
sub update_devcount {
    my ($self, $fidid) = @_;
    my $dbh = $self->dbh;
    my $ct = $dbh->selectrow_array("SELECT COUNT(*) FROM file_on WHERE fid=?",
                                   undef, $fidid);
    eval { $dbh->do("UPDATE file SET devcount=? WHERE fid=?", undef,
                    $ct, $fidid); };
    $self->condthrow;
    return 1;
}
# update the classid for a given fidid.  Returns 1; dies on DB error.
sub update_classid {
    my ($self, $fidid, $classid) = @_;
    my $dbh = $self->dbh;
    $dbh->do("UPDATE file SET classid=? WHERE fid=?", undef,
             $classid, $fidid);
    $self->condthrow;
    return 1;
}
# enqueue a fidid for replication, from a specific deviceid (can be undef),
# in a given number of seconds.  nexttry is computed as a SQL expression so
# the DB clock, not the tracker clock, is authoritative.
sub enqueue_for_replication {
    my ($self, $fidid, $from_devid, $in) = @_;

    $in = 0 unless $in;
    my $nexttry = $self->unix_timestamp . " + " . int($in);

    $self->retry_on_deadlock(sub {
        $self->insert_ignore("INTO file_to_replicate (fid, fromdevid, nexttry) ".
                             "VALUES (?,?,$nexttry)", undef, $fidid, $from_devid);
    });
}
# enqueue a fidid for delete
# note: if we get one more "independent" queue like this, the
# code should be collapsable? I tried once and it looked too ugly, so we have
# this instead.
sub enqueue_for_delete2 {
    my ($self, $fidid, $in) = @_;

    $in = 0 unless $in;
    my $nexttry = $self->unix_timestamp . " + " . int($in);

    $self->retry_on_deadlock(sub {
        $self->insert_ignore("INTO file_to_delete2 (fid, nexttry) ".
                             "VALUES (?,$nexttry)", undef, $fidid);
    });
}
# enqueue a fidid for work of a given type, $in seconds from now.
# $fidid may be a plain fid, or an arrayref of [fid, devid, arg].
# NOTE(review): the branch condition was truncated in source; reconstructed
# as a ref() test based on the two visible insert shapes — verify upstream.
sub enqueue_for_todo {
    my ($self, $fidid, $type, $in) = @_;

    $in = 0 unless $in;
    my $nexttry = $self->unix_timestamp . " + " . int($in);

    $self->retry_on_deadlock(sub {
        if (ref($fidid)) {
            $self->insert_ignore("INTO file_to_queue (fid, devid, arg, type, ".
                                 "nexttry) VALUES (?,?,?,?,$nexttry)", undef,
                                 $fidid->[0], $fidid->[1], $fidid->[2], $type);
        } else {
            $self->insert_ignore("INTO file_to_queue (fid, type, nexttry) ".
                                 "VALUES (?,?,$nexttry)", undef, $fidid, $type);
        }
    });
}
# Bulk version of enqueue_for_todo.  return 1 on success. die otherwise.
# Falls back to one-at-a-time when the backend can't do multi-row
# insert-ignore/replace safely.
sub enqueue_many_for_todo {
    my ($self, $fidids, $type, $in) = @_;
    if (! ($self->can_insert_multi && ($self->can_replace || $self->can_insertignore))) {
        $self->enqueue_for_todo($_, $type, $in) foreach @$fidids;
        return 1;
    }

    $in = 0 unless $in;
    my $nexttry = $self->unix_timestamp . " + " . int($in);

    # TODO: convert to prepared statement?
    $self->retry_on_deadlock(sub {
        if (ref($fidids->[0]) eq 'ARRAY') {
            # each element is [fid, devid, arg]
            my $sql = $self->ignore_replace .
                "INTO file_to_queue (fid, devid, arg, type, nexttry) VALUES ".
                join(', ', ('(?,?,?,?,?)') x scalar @$fidids);
            $self->dbh->do($sql, undef, map { @$_, $type, $nexttry } @$fidids);
        } else {
            # plain fids; values are inlined (int() sanitized) rather than bound
            $self->dbh->do($self->ignore_replace . " INTO file_to_queue (fid, type, nexttry) VALUES " .
                join(",", map { "(" . int($_) . ", $type, $nexttry)" } @$fidids));
        }
    });
    return 1;
}
# For file_to_queue queues that should be kept small, find the size.
# This isn't fast, but for small queues won't be slow, and is usually only ran
# from a single tracker.
sub file_queue_length {
    my $self = shift;
    my $type = shift;
    return $self->dbh->selectrow_array("SELECT COUNT(*) FROM file_to_queue " .
                                       "WHERE type = ?", undef, $type);
}
# reschedule all deferred replication, return number rescheduled
# NOTE(review): sub header was truncated in source; name restored from
# the upstream MogileFS::Store API.
sub replicate_now {
    my ($self) = @_;

    return $self->retry_on_deadlock(sub {
        return $self->dbh->do("UPDATE file_to_replicate SET nexttry = " . $self->unix_timestamp .
                              " WHERE nexttry > " . $self->unix_timestamp);
    });
}
# takes two arguments, devid and limit, both required. returns an arrayref
# of fidids.
# Fix: $limit was interpolated into the SQL unsanitized; force it to an
# integer to keep the statement injection-safe.
sub get_fidids_by_device {
    my ($self, $devid, $limit) = @_;
    $limit = int($limit);

    my $dbh = $self->dbh;
    my $fidids = $dbh->selectcol_arrayref("SELECT fid FROM file_on WHERE devid = ? LIMIT $limit",
                                          undef, $devid);
    return $fidids;
}
# finds a chunk of fids given a set of constraints:
#   devid, fidid, age (new or old), limit
# Note that if this function is very slow on your large DB, you're likely
# sorting by "newfiles" and are missing a new index.
# returns an arrayref of fidids
# NOTE(review): sort-order assignments and the limit default were truncated
# in source; reconstructed — verify against upstream.
sub get_fidid_chunks_by_device {
    my ($self, %o) = @_;

    my $dbh = $self->dbh;
    my $devid = delete $o{devid};
    croak("must supply at least a devid") unless $devid;

    my $age   = delete $o{age};
    my $fidid = delete $o{fidid};
    my $limit = delete $o{limit};
    croak("invalid options: " . join(', ', keys %o)) if %o;

    # If supplied a "previous" fidid, we're paging through.
    my $fidsort = '';
    my $order   = '';
    $age ||= 'old';
    if ($age eq 'old') {
        $fidsort = 'AND fid > ?' if $fidid;
        $order   = 'ASC';
    } elsif ($age eq 'new') {
        $fidsort = 'AND fid < ?' if $fidid;
        $order   = 'DESC';
    } else {
        croak("invalid age argument: " . $age);
    }
    $limit = int($limit || 100);

    my @extra = ();
    push @extra, $fidid if $fidid;

    my $fidids = $dbh->selectcol_arrayref("SELECT fid FROM file_on WHERE devid = ? " .
        $fidsort . " ORDER BY fid $order LIMIT $limit", undef, $devid, @extra);
    return $fidids;
}
# takes two arguments, fidid to be above, and optional limit (default
# 1,000). returns up to that many fidids above the provided
# fidid. returns array of MogileFS::FID objects, sorted by fid ids.
sub get_fids_above_id {
    my ($self, $fidid, $limit) = @_;
    $limit ||= 1000;
    $limit = int($limit);

    my @ret;
    my $dbh = $self->dbh;
    my $sth = $dbh->prepare("SELECT fid, dmid, dkey, length, classid, devcount ".
                            "FROM file ".
                            "WHERE fid > ? ".
                            "ORDER BY fid LIMIT $limit");
    $sth->execute($fidid);
    while (my $row = $sth->fetchrow_hashref) {
        push @ret, MogileFS::FID->new_from_db_row($row);
    }
    return @ret;
}
# Same as above, but returns an arrayref of plain fidids instead of
# blessed MogileFS::FID objects.
sub get_fidids_above_id {
    my ($self, $fidid, $limit) = @_;
    $limit ||= 1000;
    $limit = int($limit);

    my $dbh = $self->dbh;
    my $fidids = $dbh->selectcol_arrayref(qq{SELECT fid FROM file WHERE fid > ?
        ORDER BY fid LIMIT $limit}, undef, $fidid);
    return $fidids;
}
# creates a new domain, given a domain namespace string. return the dmid on
# success, throw 'dup' on duplicate name.
# override if you want a less racy version.
sub create_domain {
    my ($self, $name) = @_;
    my $dbh = $self->dbh;

    # get the max domain id
    my $maxid = $dbh->selectrow_array('SELECT MAX(dmid) FROM domain') || 0;
    my $rv = eval {
        $dbh->do('INSERT INTO domain (dmid, namespace) VALUES (?, ?)',
                 undef, $maxid + 1, $name);
    };
    if ($self->was_duplicate_error) {
        throw("dup");
    }
    return $maxid+1 if $rv;
    die "failed to make domain"; # FIXME: the above is racy.
}
# Update arbitrary columns of a host row from a hashref of col => value.
# Throws 'dup' (via conddup) on unique-constraint violation.
# (Sub header was truncated in source; restored.)
sub update_host {
    my ($self, $hid, $to_update) = @_;
    my @keys = sort keys %$to_update;
    return unless @keys;
    $self->conddup(sub {
        $self->dbh->do("UPDATE host SET " . join('=?, ', @keys)
            . "=? WHERE hostid=?", undef, (map { $to_update->{$_} } @keys),
            $hid);
    });
    return 1;
}
# Update a single host column.  $col is interpolated and must come from
# trusted (server-internal) callers, not user input.
sub update_host_property {
    my ($self, $hostid, $col, $val) = @_;
    $self->conddup(sub {
        $self->dbh->do("UPDATE host SET $col=? WHERE hostid=?", undef, $val, $hostid);
    });
    return 1;
}
# return new hostid, or throw 'dup' on error.
# NOTE: you need to put them into the initial 'down' state.
sub create_host {
    my ($self, $hostname, $ip) = @_;
    my $dbh = $self->dbh;
    # racy! lazy. no, better: portable! how often does this happen? :)
    my $hid = ($dbh->selectrow_array('SELECT MAX(hostid) FROM host') || 0) + 1;
    my $rv = $self->conddup(sub {
        $dbh->do("INSERT INTO host (hostid, hostname, hostip, status) ".
                 "VALUES (?, ?, ?, 'down')",
                 undef, $hid, $hostname, $ip);
    });
    return $hid if $rv;
    die "db failure";
}
# return array of row hashrefs containing columns: (fid, fromdevid,
# failcount, flags, nexttry)
# NOTE(review): ORDER BY / LIMIT lines were truncated in source;
# reconstructed — verify against upstream.
sub files_to_replicate {
    my ($self, $limit) = @_;
    my $ut = $self->unix_timestamp;
    my $to_repl_map = $self->dbh->selectall_hashref(qq{
        SELECT fid, fromdevid, failcount, flags, nexttry
        FROM file_to_replicate
        WHERE nexttry <= $ut
        ORDER BY nexttry
        LIMIT $limit
    }, "fid") or return ();
    return values %$to_repl_map;
}
1563 # "new" style queue consumption code.
1564 # from within a transaction, fetch a limit of fids,
1565 # then update each fid's nexttry to be off in the future,
1566 # giving local workers some time to dequeue the items.
1568 # DBI (even with RaiseError) returns weird errors on
1569 # deadlocks from selectall_hashref. So we can't do that.
1570 # we also used to retry on deadlock within the routine,
1571 # but instead lets return undef and let job_master retry.
1572 sub grab_queue_chunk
{
1576 my $extfields = shift;
1578 my $dbh = $self->dbh;
1582 my $extwhere = shift || '';
1583 my $fields = 'fid, nexttry, failcount';
1584 $fields .= ', ' . $extfields if $extfields;
1587 my $ut = $self->unix_timestamp;
1591 WHERE nexttry
<= $ut
1596 $query .= "FOR UPDATE\n" if $self->can_for_update;
1597 my $sth = $dbh->prepare($query);
1599 $work = $sth->fetchall_hashref('fid');
1600 # Nothing to work on.
1601 # Now claim the fids for a while.
1602 # TODO: Should be configurable... but not necessary.
1603 my $fidlist = join(',', keys %$work);
1604 unless ($fidlist) { $dbh->commit; return; }
1605 $dbh->do("UPDATE $queue SET nexttry = $ut + 1000 WHERE fid IN ($fidlist)");
1608 if ($self->was_deadlock_error) {
1609 eval { $dbh->rollback };
1614 return defined $work ?
values %$work : ();
# Claim up to $limit rows from the replication queue; returns row hashrefs.
sub grab_files_to_replicate {
    my ($self, $limit) = @_;
    return $self->grab_queue_chunk('file_to_replicate', $limit,
                                   'fromdevid, flags');
}
# Claim up to $limit rows from the delete2 queue; returns row hashrefs.
sub grab_files_to_delete2 {
    my ($self, $limit) = @_;
    return $self->grab_queue_chunk('file_to_delete2', $limit);
}
# $extwhere is ugly... but should be fine.
# Claim up to $limit rows of a given $type from the generic work queue.
sub grab_files_to_queued {
    my ($self, $type, $what, $limit) = @_;
    $what ||= 'type, flags';
    return $self->grab_queue_chunk('file_to_queue', $limit,
                                   $what, 'AND type = ' . $type);
}
# although it's safe to have multiple tracker hosts and/or processes
# replicating the same file, around, it's inefficient CPU/time-wise,
# and it's also possible they pick different places and waste disk.
# so the replicator asks the store interface when it's about to start
# and when it's done replicating a fidid, so you can do something smart
# and tell it not to.
sub should_begin_replicating_fidid {
    my ($self, $fidid) = @_;
    warn("Inefficient implementation of should_begin_replicating_fidid() in $self!\n");
    1;
}
# called when replicator is done replicating a fid, so you can cleanup
# whatever you did in 'should_begin_replicating_fidid' above.
#
# NOTE: there's a theoretical race condition in the rebalance code,
# where (without locking as provided by
# should_begin_replicating_fidid/note_done_replicating), all copies of
# a file can be deleted by independent replicators doing rebalancing
# in different ways. so you'll probably want to implement some
# locking in this pair of functions.
sub note_done_replicating {
    my ($self, $fidid) = @_;
}
# Fetch the replication-queue row for a fid, or undef if not queued.
sub find_fid_from_file_to_replicate {
    my ($self, $fidid) = @_;
    return $self->dbh->selectrow_hashref("SELECT fid, nexttry, fromdevid, failcount, flags FROM file_to_replicate WHERE fid = ?",
                                         undef, $fidid);
}
# Fetch the delete2-queue row for a fid, or undef if not queued.
sub find_fid_from_file_to_delete2 {
    my ($self, $fidid) = @_;
    return $self->dbh->selectrow_hashref("SELECT fid, nexttry, failcount FROM file_to_delete2 WHERE fid = ?",
                                         undef, $fidid);
}
# Fetch the generic-queue row for a (fid, type) pair, or undef if not queued.
sub find_fid_from_file_to_queue {
    my ($self, $fidid, $type) = @_;
    return $self->dbh->selectrow_hashref("SELECT fid, devid, type, nexttry, failcount, flags, arg FROM file_to_queue WHERE fid = ? AND type = ?",
                                         undef, $fidid, $type);
}
# Remove a fid from the replication queue.
sub delete_fid_from_file_to_replicate {
    my ($self, $fidid) = @_;
    $self->retry_on_deadlock(sub {
        $self->dbh->do("DELETE FROM file_to_replicate WHERE fid=?", undef, $fidid);
    });
}
# Remove a (fid, type) pair from the generic work queue.
sub delete_fid_from_file_to_queue {
    my ($self, $fidid, $type) = @_;
    $self->retry_on_deadlock(sub {
        $self->dbh->do("DELETE FROM file_to_queue WHERE fid=? and type=?",
                       undef, $fidid, $type);
    });
}
# Remove a fid from the delete2 queue.
sub delete_fid_from_file_to_delete2 {
    my ($self, $fidid) = @_;
    $self->retry_on_deadlock(sub {
        $self->dbh->do("DELETE FROM file_to_delete2 WHERE fid=?", undef, $fidid);
    });
}
# Reschedule a queued replication to an absolute unix time, bumping failcount.
sub reschedule_file_to_replicate_absolute {
    my ($self, $fid, $abstime) = @_;
    $self->retry_on_deadlock(sub {
        $self->dbh->do("UPDATE file_to_replicate SET nexttry = ?, failcount = failcount + 1 WHERE fid = ?",
                       undef, $abstime, $fid);
    });
}
# Reschedule a queued replication $in_n_secs from the DB's current time,
# bumping failcount.
sub reschedule_file_to_replicate_relative {
    my ($self, $fid, $in_n_secs) = @_;
    $self->retry_on_deadlock(sub {
        $self->dbh->do("UPDATE file_to_replicate SET nexttry = " . $self->unix_timestamp . " + ?, " .
                       "failcount = failcount + 1 WHERE fid = ?",
                       undef, $in_n_secs, $fid);
    });
}
# Reschedule a queued delete2 to an absolute unix time, bumping failcount.
sub reschedule_file_to_delete2_absolute {
    my ($self, $fid, $abstime) = @_;
    $self->retry_on_deadlock(sub {
        $self->dbh->do("UPDATE file_to_delete2 SET nexttry = ?, failcount = failcount + 1 WHERE fid = ?",
                       undef, $abstime, $fid);
    });
}
# Reschedule a queued delete2 $in_n_secs from the DB's current time,
# bumping failcount.
sub reschedule_file_to_delete2_relative {
    my ($self, $fid, $in_n_secs) = @_;
    $self->retry_on_deadlock(sub {
        $self->dbh->do("UPDATE file_to_delete2 SET nexttry = " . $self->unix_timestamp . " + ?, " .
                       "failcount = failcount + 1 WHERE fid = ?",
                       undef, $in_n_secs, $fid);
    });
}
# Given a dmid, prefix, after and limit, return an arrayref of dkey from the
# file table.
# NOTE(review): the header and the prefix/limit normalization lines were
# truncated in source; reconstructed — verify against upstream.
sub get_keys_like {
    my ($self, $dmid, $prefix, $after, $limit) = @_;
    # fix the input... prefix always ends with a % so that it works
    # in a LIKE call, and after is either blank or something
    $prefix = '' unless defined $prefix;
    $prefix .= '%';
    $after = '' unless defined $after;

    $limit = int($limit || 1000);

    # now select out our keys
    return $self->dbh->selectcol_arrayref
        ('SELECT dkey FROM file WHERE dmid = ? AND dkey LIKE ? AND dkey > ? ' .
         "ORDER BY dkey LIMIT $limit", undef, $dmid, $prefix, $after);
}
# return arrayref of all tempfile rows (themselves also arrayrefs, of
# [$fidid, $devids]) that were created $secs_old seconds ago or older.
# (Sub header was truncated in source; name restored from upstream.)
sub old_tempfiles {
    my ($self, $secs_old) = @_;
    return $self->dbh->selectall_arrayref("SELECT fid, devids FROM tempfile " .
        "WHERE createtime < " . $self->unix_timestamp . " - $secs_old LIMIT 50");
}
# given an array of MogileFS::DevFID objects, mass-insert them all
# into file_on (ignoring if they're already present)
sub mass_insert_file_on {
    my ($self, @devfids) = @_;
    return 1 unless @devfids;

    if (@devfids > 1 && ! $self->can_insert_multi) {
        # backend can't do multi-row inserts; do them one at a time
        $self->mass_insert_file_on($_) foreach @devfids;
        return 1;
    }

    my (@qmarks, @binds);
    foreach my $df (@devfids) {
        my ($fidid, $devid) = ($df->fidid, $df->devid);
        Carp::croak("got a false fidid") unless $fidid;
        Carp::croak("got a false devid") unless $devid;
        push @binds, $fidid, $devid;
        push @qmarks, "(?,?)";
    }

    # TODO: This should possibly be insert_ignore instead
    # As if we are adding an extra file_on entry, we do not want to replace the
    # exist one. Check REPLACE semantics.
    $self->dowell($self->ignore_replace . " INTO file_on (fid, devid) VALUES " . join(',', @qmarks), undef, @binds);
    return 1;
}
# Persist the schema version as a server setting.
# NOTE: the misspelled name ("vesion") is the established public API of this
# class — callers elsewhere use it, so it must not be renamed here.
sub set_schema_vesion {
    my ($self, $ver) = @_;
    $self->set_server_setting("schema_version", int($ver));
}
# returns array of fidids to try and delete again
# NOTE(review): SELECT column list and LIMIT were truncated in source;
# reconstructed — verify against upstream.
sub fids_to_delete_again {
    my $self = shift;
    my $ut = $self->unix_timestamp;
    return @{ $self->dbh->selectcol_arrayref(qq{
        SELECT fid
        FROM file_to_delete_later
        WHERE delafter < $ut
        LIMIT 500
    }) || [] };
}
# return 1 on success. die otherwise.
sub enqueue_fids_to_delete {
    my ($self, @fidids) = @_;
    # multi-row insert-ignore/replace CAN fail with the insert_ignore emulation sub.
    # when the first row causes the duplicate error, and the remaining rows are
    # not processed.
    if (@fidids > 1 && ! ($self->can_insert_multi && ($self->can_replace || $self->can_insertignore))) {
        $self->enqueue_fids_to_delete($_) foreach @fidids;
        return 1;
    }
    # TODO: convert to prepared statement?
    $self->retry_on_deadlock(sub {
        $self->dbh->do($self->ignore_replace . " INTO file_to_delete (fid) VALUES " .
            join(",", map { "(" . int($_) . ")" } @fidids));
    });
    $self->condthrow;
    return 1;
}
# Same as enqueue_fids_to_delete, but for the newer file_to_delete2 queue
# (which carries a nexttry timestamp).  return 1 on success. die otherwise.
sub enqueue_fids_to_delete2 {
    my ($self, @fidids) = @_;
    # multi-row insert-ignore/replace CAN fail with the insert_ignore emulation sub.
    # when the first row causes the duplicate error, and the remaining rows are
    # not processed.
    if (@fidids > 1 && ! ($self->can_insert_multi && ($self->can_replace || $self->can_insertignore))) {
        $self->enqueue_fids_to_delete2($_) foreach @fidids;
        return 1;
    }

    my $nexttry = $self->unix_timestamp;

    # TODO: convert to prepared statement?
    $self->retry_on_deadlock(sub {
        $self->dbh->do($self->ignore_replace . " INTO file_to_delete2 (fid, nexttry) VALUES " .
            join(",", map { "(" . int($_) . ", $nexttry)" } @fidids));
    });
    $self->condthrow;
    return 1;
}
# clears everything from the fsck_log table
# return 1 on success. die otherwise.
sub clear_fsck_log {
    my $self = shift;
    $self->dbh->do("DELETE FROM fsck_log");
    return 1;
}
# FIXME: Fsck log entries are processed a little out of order.
# Once a fsck has completed, the log should be re-summarized.
# Summarizes new fsck_log rows into per-evcode counters stored as server
# settings, under a lock so only one tracker summarizes at a time.
# Returns 0 if the lock was held elsewhere.
sub fsck_log_summarize {
    my $self = shift;

    my $lockname = 'mgfs:fscksum';
    my $lock = eval { $self->get_lock($lockname, 10) };
    return 0 if defined $lock && $lock == 0;

    my $logid = $self->max_fsck_logid;

    # sum-up evcode counts every so often, to make fsck_status faster,
    # avoiding a potentially-huge GROUP BY in the future..
    my $start_max_logid = $self->server_setting("fsck_start_maxlogid") || 0;
    my $min_logid = $self->server_setting("fsck_logid_processed") || 0;
    $min_logid++;
    my $cts = $self->fsck_evcode_counts(logid_range => [$min_logid, $logid]); # inclusive notation :)
    while (my ($evcode, $ct) = each %$cts) {
        $self->incr_server_setting("fsck_sum_evcount_$evcode", $ct);
    }
    $self->set_server_setting("fsck_logid_processed", $logid);

    $self->release_lock($lockname) if $lock;
}
# Append a row to the fsck_log table.  Accepts opts: fid, code, devid.
# Croaks on unrecognized opts.  (Sub header and the fid/code bind lines
# were truncated in source; reconstructed — verify against upstream.)
sub fsck_log {
    my ($self, %opts) = @_;
    $self->dbh->do("INSERT INTO fsck_log (utime, fid, evcode, devid) ".
                   "VALUES (" . $self->unix_timestamp . ",?,?,?)",
                   undef,
                   delete $opts{fid},
                   delete $opts{code},
                   delete $opts{devid});
    croak("Unknown opts") if %opts;
    $self->condthrow;
}
# Return the database server's idea of the current unix time.
sub get_db_unixtime {
    my $self = shift;
    return $self->dbh->selectrow_array("SELECT " . $self->unix_timestamp);
}
# Return the highest fid in the file table (undef if the table is empty).
# (Sub header was truncated in source; name restored from upstream.)
sub max_fidid {
    my $self = shift;
    return $self->dbh->selectrow_array("SELECT MAX(fid) FROM file");
}
# Return the highest logid in fsck_log, or 0 if the table is empty.
sub max_fsck_logid {
    my $self = shift;
    return $self->dbh->selectrow_array("SELECT MAX(logid) FROM fsck_log") || 0;
}
# returns array of $row hashrefs, from fsck_log table
# NOTE(review): WHERE/ORDER/LIMIT lines were truncated in source;
# reconstructed — verify against upstream.
sub fsck_log_rows {
    my ($self, $after_logid, $limit) = @_;
    $limit = int($limit || 100);
    $after_logid = int($after_logid || 0);

    my @rows;
    my $sth = $self->dbh->prepare(qq{
        SELECT logid, utime, fid, evcode, devid
        FROM fsck_log
        WHERE logid > ?
        ORDER BY logid
        LIMIT $limit
    });
    $sth->execute($after_logid);
    my $row;
    push @rows, $row while $row = $sth->fetchrow_hashref;
    return @rows;
}
# Count fsck_log rows grouped by evcode, constrained by either time_gte
# (utime >= value) or logid_range ([min, max], inclusive).  Returns a
# hashref of evcode => count.
# NOTE(review): interior lines (opts validation, GROUP BY, result
# accumulation) were truncated in source; reconstructed — verify upstream.
sub fsck_evcode_counts {
    my ($self, %opts) = @_;
    my $timegte = delete $opts{time_gte};
    my $logr    = delete $opts{logid_range};
    die "Unknown opts" if %opts;

    my %ret;
    my $sth;
    if ($timegte) {
        $sth = $self->dbh->prepare(qq{
            SELECT evcode, COUNT(*) FROM fsck_log
            WHERE utime >= ?
            GROUP BY evcode
        });
        $sth->execute($timegte||0);
    }
    if ($logr) {
        $sth = $self->dbh->prepare(qq{
            SELECT evcode, COUNT(*) FROM fsck_log
            WHERE logid >= ? AND logid <= ?
            GROUP BY evcode
        });
        $sth->execute($logr->[0], $logr->[1]);
    }
    while (my ($ev, $ct) = $sth->fetchrow_array) {
        $ret{$ev} = $ct;
    }
    return \%ret;
}
# run before daemonizing. you can die from here if you see something's amiss.
# or emit warnings.  Base-class implementation is intentionally a no-op;
# subclasses override it.
sub pre_daemonize_checks { }
# attempt to grab a lock of lockname, and timeout after timeout seconds.
# returns 1 on success and 0 on timeout. dies if more than one lock is
# already outstanding.  Base class is abstract: subclasses must override.
sub get_lock {
    my ($self, $lockname, $timeout) = @_;
    die "Lock recursion detected (grabbing $lockname, had $self->{last_lock}). Bailing out." if $self->{lock_depth};
    die "get_lock not implemented for $self";
}
# attempt to release a lock of lockname.
# returns 1 on success and 0 if no lock we have has that name.
# Base class is abstract: subclasses must override.
sub release_lock {
    my ($self, $lockname) = @_;
    die "release_lock not implemented for $self";
}
# returns up to $limit @fidids which are on provided $devid
sub random_fids_on_device {
    my ($self, $devid, $limit) = @_;
    $limit = int($limit) || 100;

    my $dbh = $self->dbh;

    # FIXME: this blows. not random. and good chances these will
    # eventually get to point where they're un-rebalance-able, and we
    # never move on past the first 5000
    my @some_fids = List::Util::shuffle(@{
        $dbh->selectcol_arrayref("SELECT fid FROM file_on WHERE devid=? LIMIT 5000",
                                 undef, $devid) || []
    });

    @some_fids = @some_fids[0..$limit-1] if $limit < @some_fids;
    return @some_fids;
}
2000 MogileFS::Store - data storage provider. base class.
MogileFS aims to be database-independent.  The server creates a singleton
instance of a C<MogileFS::Store> subclass — such as
L<MogileFS::Store::MySQL>, L<MogileFS::Store::SQLite>, or
L<MogileFS::Store::Postgres> — and all database interaction is done
through it.
2012 L<MogileFS::Store::MySQL>