lib/data_subs.pm

   1 This file contains the code for the subroutines in
   2 Perl-speaks-NONMEMs data module. It is not functional by itself.
   3 The code should be transferred to the module autogenerated by dia2code
   4 using the fill_diacode.pl script.
   5
   6 TODO: Fix the synced attribute so that all methods that change the state of the
   7 data object sets synced=0
   8
   9 # {{{ include
  10
  11 start include statements
  12     use Digest::MD5 'md5_hex';
  13 use OSspecific;
  14 use File::Copy "cp";
  15 use Carp;
  16 use Carp qw(cluck);
  17 use Config;
  18 use Math::Random;
  19 use debug;
  20 use ui;
  21 use Data::Dumper;
  22 use Time::HiRes qw(gettimeofday);
  23 end include
  24
  25 # }}} include statements
  26
  27 start description
  28     # The structure of the data class is subject-centric, recognising that
  29     # the subjects included in a study often can be regarded as
  30     # independent. A class for the subject level exists within PsN and is
  31     # called the individual class. A data object consists of at least one
  32     # but probably many individual objects plus optional comments.
  33 end description
  34
  35 start synopsis
  36     #   use data;
  37     #
  38     #   my $data_obj = data -> new ( filename => 'test040314.dta' );
  39     #
  40     #   $data_obj -> renumber_ascending;
  41     #
  42     #   my $subsets_ref = $data_obj -> case_deletion( bins => 10 );
  43     #
  44     #   my @subsets = @{$subsets_ref};
  45 end synopsis
  46
  47 start see_also
  48     # =begin html
  49     #
  50     # <a HREF="model.html">model</a>, <a HREF="output.html">output</a>,
  51     # <a HREF="tool/modelfit.html">tool::modelfit</a>,
  52     # <a HREF="tool.html">tool</a>
  53     #
  54     # =end html
  55     #
  56     # =begin man
  57     #
  58     # model, output, tool::modelfit, tool
  59     #
  60     # =end man
  61 end see_also
  62
  63
  64 # {{{ new
  65
  66 start new
  67       {
  68         # If the column holding the subject identifier is not the
  69         # first, it can be specified using the I<idcolumn> attribute
  70         #
  71         # I<ignoresign> determines which rows that are regarded as
  72         # comments. Corresponds to the IGNORE= option in the $DATA
  73         # record in a NONMEM model file.
  74
  75         ( $this -> {'directory'},
  76           $this -> {'filename'} ) = OSspecific::absolute_path( $this -> {'directory'},
  77                                                                $this->{'filename'} );
  78
  79         debug -> warn( level => 2,
  80                        message => "data -> new: Data object initialized from file: ".
  81                        $this -> full_name );
  82
  83         sub register_in_database {
  84           # Backslashes messes up the sql syntax
  85           my $file_str = $this->{'filename'};
  86           my $dir_str = $this->{'directory'};
  87           $file_str =~ s/\\/\//g;
  88           $dir_str =~ s/\\/\//g;
  89
  90           # md5sum
  91           my $md5sum = md5_hex(OSspecific::slurp_file($this-> full_name ));
  92           my $dbh = DBI -> connect("DBI:mysql:host=localhost;databse=psn",
  93                                    "psn", "psn_test",
  94                                    {'RaiseError' => 1});
  95           my $sth;
  96           my $sth = $dbh -> prepare( "SELECT data_id FROM psn.data ".
  97                                      "WHERE filename = '$file_str' AND ".
  98                                      "directory = '$dir_str' AND ".
  99                                      "md5sum = '".$md5sum."'" );
 100           $sth -> execute or debug -> die( message => $sth->errstr ) ;
 101           my $select_arr = $sth -> fetchall_arrayref;
 102           if ( scalar @{$select_arr} > 0 ) {
 103             debug -> warn( level   => 1,
 104                            message => "Found an old entry in the database matching the ".
 105                            "current data file" );
 106             if ( scalar @{$select_arr} > 1 ) {
 107               debug -> warn( level   => 1,
 108                              message => "Found more than one matching entry in database".
 109                              ", using the first" );
 110             }
 111             $this -> {'data_id'} = $select_arr->[0][0];
 112           } else {
 113             my ( $date_cmd, $time_cmd );
 114             if( $Config{osname} eq 'MSWin32' ){
 115               $date_cmd = 'date /T';
 116               $time_cmd = 'time /T';
 117             } else {
 118               # Assuming UNIX
 119               $date_cmd = 'date';
 120               $time_cmd = 'time';
 121             }
 122
 123             my $date_str = `$date_cmd`;
 124             my $time_str = `$time_cmd`;
 125             chomp($date_str);
 126             chomp($time_str);
 127             my $date_time = "$date_str $time_str";
 128
 129             $sth = $dbh -> prepare("INSERT INTO psn.data (filename,date,directory,md5sum) ".
 130                                    "VALUES ('$file_str', '$date_time', '$dir_str','".
 131                                    $md5sum."' )");
 132             $sth -> execute;
 133             $this -> {'data_id'} = $sth->{'mysql_insertid'};
 134           }
 135           $sth -> finish;
 136           $dbh -> disconnect;
 137         }
 138
 139
 140         unless ( ( defined $this -> {'header'} and
 141                    scalar @{$this -> {'header'}} > 0 ) or
 142                  ( defined $this -> {'individuals'} and
 143                    scalar @{$this -> {'individuals'}} > 0 ) ) {
 144           if ( -e $this -> full_name ) {
 145             if ( $this -> {'target'} eq 'mem' ) {
 146               &register_in_database if ( $PsN::config -> {'_'} -> {'use_database'} );
 147               $this -> _read_header;
 148               $this -> _read_individuals;
 149               $this -> {'synced'} = 1;
 150             } else {
 151               $this -> {'synced'} = 0;
 152             }
 153           } else {
 154             debug -> die(message => "Error in data -> new: No header, individuals, and no file " . $this -> full_name . " on disk.")
 155               unless $this -> {'ignore_missing_files'};
 156             $this ->  {'synced'} = 0;
 157           }
 158         } else {
 159 #         if ( defined $this -> {'header'} and
 160 #              scalar @{$this -> {'header'}} > 0 ) {
 161 #           if ( -e $this -> full_name ) {
 162 #             my @given_header =  @{$this -> {'header'}};
 163 # #           $this -> _read_header;
 164 #             print "HEPP\n";
 165 #             foreach my $col ( @{$this -> {'header'}} ) {
 166 #               my $given_col = shift(@given_header);
 167 #               print "DATA $col, MODEL $given_col\n" if $this -> {'debug'};
 168 #               #               unless( $col eq $given_col ){
 169 #               #                 die "data::new: Given header does not match header in file on disk\n";
 170 #               #               }
 171 #             }
 172 #           }
 173 #         }
 174
 175           if ( $this -> {'target'} eq 'mem') {
 176             if ( -e $this -> {'filename'} ) {
 177               $this -> _read_header;
 178               &register_in_database if ( $PsN::config -> {'_'} -> {'use_database'} );
 179               $this -> _read_individuals;
 180               $this -> {'synced'} = 1;
 181             } else {
 182               confess "Error in data -> new: No file:".$this->{'filename'}." on disk\n"
 183                 unless $this -> {'ignore_missing_files'};
 184               $this -> {'synced'} = 0;
 185             }
 186           } else {
 187             $this -> flush;
 188           }
 189         }
 190
 191         if ( $this -> {'synced'} ) {
 192           my $i = 1;
 193           foreach my $head ( @{$this -> {'header'}} ) {
 194             $this -> {'column_head_indices'} -> {$head} = $i;
 195             $i++;
 196           }
 197         }
 198       }
 199 end new
 200
 201 # }}} new
 202
 203 # {{{ full_name
 204 start full_name
 205       {
 206         $full_name = $self -> {'directory'} . $self -> {'filename'};
 207       }
 208 end full_name
 209 # }}}
 210
 211 # {{{ bootstrap
 212
 213 start bootstrap
 214       {
 215         # The bootstrap method draws I<samples> number of boostrap
 216         # samples from the data set. The I<subjects> arguments
 217         # determines the size of each sample (default equals to the
 218         # number of individuals in the original data set). The method
 219         # returns references to three arrays: I<boot_samples_ref>,
 220         # which holds the bootstrap data sets, I<incl_individuals_ref>
 221         # which holds arrays containing the subject identifiers (ID's)
 222         # for the included individuals of each bootstrap data set and
 223         # I<included_keys_ref> which holds the key or index of the
 224         # included individuals. The key or index is an integer
 225         # starting at 1 for the first individual in the original data
 226         # set and increasing by one for each following.
 227         $self -> synchronize;
 228         my @header      = @{$self -> {'header'}};
 229         my $individuals = $self -> {'individuals'};
 230         my $key_ref;
 231         for ( my $i = 1; $i <= $samples; $i++ ) {
 232           my $new_name = defined $name_stub ? $name_stub."_$i.dta" : "bs$i.dta";
 233           $new_name = $directory.'/'.$new_name;
 234           my ( $boot, $incl_ind_ref, $incl_key_ref ) =
 235             $self -> resample( subjects    => $subjects,
 236                                resume      => $resume,
 237                                new_name    => $new_name,
 238                                target      => $target,
 239                                stratify_on => $stratify_on );
 240           push( @included_keys, $incl_key_ref );
 241           push( @incl_individuals, $incl_ind_ref );
 242 #         $boot -> renumber_ascending;
 243           push( @boot_samples, $boot );
 244 #          $boot -> synchronize;
 245 #          $boot -> flush;
 246         }
 247       }
 248 end bootstrap
 249
 250 # }}} bootstrap
 251
 252 # {{{ resample
 253
 254 start resample
 255       {
 256         $self -> synchronize;
 257         my ( @header, $individuals, @bs_inds, $key_ref );
 258         if ( defined $stratify_on ) {
 259           unless ( $resume and -e $new_name ) {
 260             @header = @{$self -> {'header'}};
 261             $individuals = $self -> {'individuals'};
 262             my %strata;
 263             if( $stratify_on =~ /\D/ ){
 264               %strata = %{$self -> factors( column_head => $stratify_on )};
 265               if ( $strata{'Non-unique values found'} eq '1' ) {
 266                 debug -> die( message => "Individuals were found to have multiple values in the $stratify_on column. ".
 267                               "The column $stratify_on cannot be used for stratification of the resampling." );
 268               }
 269             } else {
 270               %strata = %{$self -> factors( column => $stratify_on )};
 271               if ( $strata{'Non-unique values found'} eq '1' ) {
 272                 debug -> die( message => "Individuals were found to have multiple values in column number $stratify_on. ".
 273                               "Column $stratify_on cannot be used for stratification of the resampling." );
 274               }
 275             }
 276
 277             while( my ( $factor, $key_list ) = each %strata ) {
 278               my $keys = scalar @{$key_list};
 279               for ( my $i = 0; $i < $keys; $i++ ) {
 280                 my $list_ref = random_uniform_integer(1,0,$keys-1);
 281                 push( @bs_inds, $individuals ->
 282                       [ $key_list -> [$list_ref] ] -> copy );
 283                 push( @included_keys, $key_list -> [$list_ref] );
 284                 push( @incl_individuals, $individuals ->
 285                       [ $key_list -> [$list_ref] ] -> idnumber );
 286               }
 287             }
 288
 289             $boot = data -> new( header      => \@header,
 290                                  idcolumn    => $self -> {'idcolumn'},
 291                                  ignoresign  => $self -> {'ignoresign'},
 292                                  individuals => \@bs_inds,
 293                                  filename    => $new_name,
 294                                  ignore_missing_files => 1,
 295                                  target      => 'mem' );
 296             $boot -> renumber_ascending;
 297             $boot -> _write;
 298             $boot -> target( $target );
 299           }
 300         } else {
 301           unless ( $resume and -e $new_name ) {
 302             @header = @{$self -> {'header'}};
 303             $individuals = $self -> {'individuals'};
 304             for ( my $i = 1; $i <= $subjects; $i++ ) {
 305               $key_ref = random_uniform_integer(1,0,scalar @{$individuals}-1);
 306               push( @bs_inds, $individuals -> [ $key_ref ] -> copy );
 307               push( @included_keys, $key_ref );
 308               push( @incl_individuals, $individuals -> [ $key_ref ] -> idnumber );
 309             }
 310
 311             # MUST FIX: If a file already exists with the same name,
 312             # the created bs data set will be appended to this. IT
 313             # MUST BE OVERWRITTEN!
 314             $boot = data -> new( header      => \@header,
 315                                  idcolumn    => $self -> {'idcolumn'},
 316                                  ignoresign  => $self -> {'ignoresign'},
 317                                  individuals => \@bs_inds,
 318                                  filename    => $new_name,
 319                                  ignore_missing_files => 1,
 320                                  target      => 'mem' );
 321             $boot -> renumber_ascending;
 322             $boot -> _write;
 323             $boot -> target( $target );
 324           } else {
 325             # If we are resuming, we still need to generate the
 326             # pseudo-random sequence and initiate a data object
 327             for ( my $i = 1; $i <= $subjects; $i++ ) {
 328               random_uniform_integer(1,0,scalar @{$individuals}-1)
 329             }
 330             $boot = data -> new( idcolumn    => $self -> {'idcolumn'},
 331                                  ignoresign  => $self -> {'ignoresign'},
 332                                  filename    => $new_name,
 333                                  ignore_missing_files => 1,
 334                                  target      => $target );
 335           }
 336           if( $target eq 'disk'){
 337             $boot -> flush;
 338           }
 339         }
 340       }
 341 end resample
 342
 343 # }}} resample
 344
 345 # {{{ case_deletion
 346
 347 start case_deletion
 348       {
 349         # case_deletion creates subsets of the data. The number of
 350         # subsets is specified by the bins argument. The individuals
 351         # of each subset is selected randomly or in ascending
 352         # numerical order depending on the selection argument that can
 353         # be either 'consecutive' or 'random'. case_column must be
 354         # specified to give the method something to base the selection
 355         # on. Valid case_column values are either the column number
 356         # (pure digits) or the name of the column in the (optional)
 357         # header row.
 358         $self -> synchronize;
 359         my @header    = @{$self -> {'header'}};
 360         if ( not defined $case_column ) {
 361           debug -> die( message => "case_column must be specified" );
 362         } else {
 363           if ( not $case_column =~ /^\d/ ) {
 364             for ( my $i = 0; $i <= $#header; $i++ ) {
 365               $case_column = $i+1 if ( $header[$i] eq $case_column );
 366             }
 367           }
 368         }
 369         $bins = defined $bins ? $bins :
 370           scalar keys %{$self -> factors( column => $case_column)};
 371         my %factors   = %{$self -> factors( column => $case_column )};
 372         if ( $factors{'Non-unique values found'} eq '1' ) {
 373           debug -> die( message => "Individuals were found to have multiple values in column number $case_column. ".
 374                         "Column $case_column cannot be used for case deletion." );
 375         }
 376
 377         my $maxbins   = scalar keys %factors;
 378         my @ftrs      = sort { $a <=> $b } keys %factors;
 379         my $individuals = $self -> {'individuals'};
 380         my $maxkey    = scalar @{$individuals} - 1;
 381
 382         my ( @tmp_ftrs, @binsize ) =
 383             ((),());
 384         my ( $k, $j, $i ) = ( 0, 0, 0 );
 385         # Create the binsizes
 386         for ( $j = 0; $j < $maxbins; $j++ ) {
 387           $binsize[ $k++ ]++;
 388           $k = 0 if( $k >= $bins );
 389         }
 390         $self -> _fisher_yates_shuffle( array => \@ftrs ) if( $selection eq 'random' );
 391         for ( $k = 0; $k < $bins; $k++ ) {
 392           for ( $j = 0; $j < $binsize[ $k ]; $j++ ) {
 393 #           print "SK: ",$skipped_keys[ $k ]," F: ",$factors{ $ftrs[ $i ] },"\n";
 394             push( @{$skipped_keys[ $k ]}, @{$factors{ $ftrs[ $i ] }} );
 395             push( @{$skipped_values[ $k ]}, $ftrs[ $i++ ] );
 396           }
 397         }
 398
 399         for ( $k = 0; $k < $bins; $k++ ) {
 400           my @cd_inds = ();
 401         SELKEYS: foreach my $key ( 0..$maxkey ) {
 402           foreach my $skipped ( @{$skipped_keys[ $k ]} ) {
 403             if ( $key == $skipped ) {
 404               push( @{$skipped_ids[ $k ]}, $individuals ->
 405                     [ $skipped ] -> idnumber );
 406               next SELKEYS;
 407             }
 408           }
 409           push( @cd_inds, $individuals -> [ $key ] -> copy );
 410         }
 411           # Set ignore_missing_files = 1 to make it possible to get the result
 412           # in memory only
 413           my $newdata = data ->
 414               new ( header      => \@header,
 415                     ignoresign  => $self -> {'ignoresign'},
 416                     idcolumn    => $self -> {'idcolumn'},
 417                     individuals => \@cd_inds,
 418                     target      => $target,
 419                     filename    => 'cd'.$k+1 .'.dta',
 420                     ignore_missing_files => 1 );
 421           push( @subsets, $newdata );
 422         }
 423       }
 424 end case_deletion
 425
 426 # }}} case_deletion
 427
 428 # {{{ copy
 429 start copy
 430       {
 431           # filename: new data file name.
 432           #
 433           # target: keep the copy in memory ('mem') or write it to disk and flush the memory ('disk').
 434
 435         ($directory, $filename) = OSspecific::absolute_path( $directory, $filename );
 436
 437         # Clone self into new data object and set synced to 0 for
 438         # the copy
 439         cp($self -> full_name, $directory.$filename );
 440         $new_data = Storable::dclone( $self );
 441 #       $new_data -> {'synced'} = 0;
 442
 443         # Set the new file name for the copy
 444         $new_data -> directory( $directory );
 445         $new_data -> filename( $filename );
 446
 447 #       $new_data -> _write;
 448
 449 #       my $header;
 450 #       my $comment;
 451 #       my $ind_copies;
 452
 453 #       unless( defined $self -> {'header'} ) {
 454 #           $self -> synchronize;
 455 #       }
 456
 457 #       my @header = @{$self -> {'header'}};
 458 #       my @comment = @{$self -> {'comment'}};
 459 #       $header = \@header;
 460 #       $comment = \@comment;
 461 #       my @ind_copies;
 462 #       foreach my $individual ( @{$self -> {'individuals'}} ) {
 463 #           push( @ind_copies, $individual -> copy );
 464 #       }
 465 #       $ind_copies = \@ind_copies;
 466
 467 #       $data_copy = data -> new ( header      => $header,
 468 #                                  comment     => $comment,
 469 #                                  ignoresign  => $self -> {'ignoresign'},
 470 #                                  individuals => $ind_copies,
 471 #                                  idcolumn    => $self -> {'idcolumn'},
 472 #                                  debug       => $self -> {'debug'},
 473 #                                  filename    => $filename,
 474 #                                  target      => $target,
 475 #                                  ignore_missing_files => $ignore_missing_files);
 476
 477 #       if( $self -> {'target'} eq 'disk' ){
 478 #           $self -> flush;
 479 #       }
 480       }
 481 end copy
 482
 483 # }}} copy
 484
 485 # {{{ count_ind
 486
 487 start count_ind
 488       {
 489           # Returns the number of individuals in the data set.
 490         $self -> synchronize;
 491         $num = scalar @{$self -> {'individuals'}};
 492       }
 493 end count_ind
 494
 495 # }}} count_ind
 496
 497 # {{{ filename
 498 start filename
 499       {
 500         if ( defined $parm and $parm ne $self -> {'filename'} ) {
 501           $self -> {'filename'} = $parm;
 502           $self -> {'data_id'} = undef;
 503           $self -> _write;
 504         }
 505       }
 506 end filename
 507 # }}} filename
 508
 509 # {{{ fractions
 510
 511 start fractions
 512       {
 513         my %factors = $self -> factors( 'return_occurences' => 1,
 514                                         'unique_in_individual' => $unique_in_individual,
 515                                         'column_head' => $column_head,
 516                                         'column' => $column);
 517
 518         my $sum = 0;
 519         while (my ($factor, $amount) = each %factors) {
 520           if ( $factor == $self -> {'missing_data'} && $ignore_missing ) {
 521             next;
 522           } else {
 523             $sum += $amount;
 524           }
 525         }
 526         while (my ($factor, $amount) = each %factors) {
 527           if ( $factor == $self -> {'missing_data'} && $ignore_missing ) {
 528             next;
 529           } else {
 530             $fractions{$factor} = $amount/$sum;
 531           }
 532         }
 533       }
 534 end fractions
 535
 536 # }}} fractions
 537
 538 # {{{ factors
 539
 540 start factors
 541       {
 542         # Either column (number, starting at 1) or column_head must be specified.
 543         #
 544         # The default behaviour is to return a hash with the factors as keys
 545         # and as values references to arrays with the order numbers (not the ID numbers)
 546         # of the individuals that contain this factor
 547         #
 548         # If unique_in_individual is true (1), the returned hash will contain
 549         # an element with key 'Non-unique values found' and value 1 if any
 550         # individual contain more than one value in the specified column.
 551         #
 552         # Return occurences will calculate the occurence of each
 553         # factor value. Several occurences in one individual counts as
 554         # one occurence. The elements of the returned hash will have the factors
 555         # as keys and the number of occurences as values.
 556         #
 557
 558         $self -> synchronize;
 559
 560         # Check if $column(-index) is defined and valid, else try to find index
 561         # using column_head
 562         my $first_id = $self -> {'individuals'}[0];
 563
 564         debug -> die( message => "No individuals defined in data object based on ".
 565                       $self -> full_name ) unless ( defined $first_id );
 566
 567         unless ( defined $column && defined( $first_id -> subject_data ->[0][$column-1] ) ) {
 568           unless (defined($column_head) && defined($self -> {'column_head_indices'}{$column_head})) {
 569             debug -> die( message => "Error in data -> factors: unknown column: \"$column_head\" ".
 570                           "or invalid column number: \"$column\".\n".
 571                           "Valid column numbers are 1 to ".scalar @{$first_id -> subject_data ->[0]}."\n".
 572                           "Valid column headers are (in no particular order):\n".
 573                           join(', ',keys(%{$self -> {'column_head_indices'}})) );
 574           } else {
 575             $column = $self -> {'column_head_indices'}{$column_head};
 576             debug -> warn( level   => 2,
 577                            message => "$column_head is in column number $column" );
 578           }
 579         }
 580
 581         my $key = 0;
 582         foreach my $individual ( @{$self -> {'individuals'}} ) {
 583           my @ifactors = keys %{$individual -> factors( column => $column )};
 584           if ( scalar @ifactors > 1 and $unique_in_individual ) {
 585             %factors = ( 'Non-unique values found' => 1 );
 586             last;
 587           }
 588           debug -> die( message => "No value found in column $column in individual ".
 589                         $individual -> idnumber ) if ( scalar @ifactors == 0 );
 590
 591           # Return occurences will calculate the occurence of each
 592           # factor value. Several occurences in one individual counts as
 593           # one occurence.
 594
 595           if ( $return_occurences ) {
 596             foreach my $ifactor ( @ifactors ) {
 597               $factors{$ifactor}++;
 598             }
 599           } else {
 600             foreach my $ifactor ( @ifactors ) {
 601               push( @{$factors{$ifactor}}, $key );
 602             }
 603           }
 604           $key++;
 605         }
 606       }
 607 end factors
 608
 609 # }}} factors
 610
 611 # {{{ find_individual
 612
 613 # start find_individual
 614 #         foreach my $tmp_ind ( @{$self -> individuals} ) {
 615 #         if ( $tmp_ind -> key == $key ) {
 616 #           $individual = $tmp_ind;
 617 #           last;
 618 #         }
 619 #       }
 620 #         if ( defined $individual ) {
 621 #         if ( $copy ) {
 622 #           $individual = $individual -> copy;
 623 #         }
 624 #       } else {
 625 #         print "No individual with key $key found in call to ".
 626 #           "data -> find_individual\n" if ( $self -> debug );
 627 #         }
 628 # end find_individual
 629
 630 # }}}
 631
 632 # {{{ format_data
 633 start format_data
 634       {
 635           # format the data for NONMEM (simple comma-separated layout)
 636         foreach my $individual ( @{$self -> {'individuals'}} ) {
 637           foreach my $row ( @{$individual -> subject_data} ) {
 638             push( @form_data, join( ',', @{$row} )."\n" );
 639           }
 640         }
 641       }
 642 end format_data
 643 # }}} format_data
 644
 645 # {{{ have_missing_data
 646 start have_missing_data
 647       {
 648         # Either I<column> or I<column_head> must be specified.
 649         #
 650         # This method looks through the data column with index I<column> or
 651         # (optional) header name I<column_head> and returns O if no missing
 652         # data indicator was found or 1 otherwise.
 653
 654         $self -> synchronize;
 655         my $first_id = $self -> {'individuals'}[0];
 656         debug -> die( message => "No individuals defined in data object based on ".
 657                       $self -> full_name ) unless ( defined $first_id );
 658         unless ( defined $column  && defined( $first_id -> subject_data ->[0][$column-1] ) ) {
 659           unless(defined($column_head) && defined($self -> {'column_head_indices'}{$column_head})){
 660             die "Error in data -> have_missing_data: unknown column: \"$column_head\" or invalid column number: \"$column\"\n";
 661           } else {
 662             $column = $self -> {'column_head_indices'}{$column_head};
 663           }
 664         }
 665         $self -> flush if ( $self -> {'target'} eq 'disk' );
 666         $return_value = $self -> {'have_missing_data'} -> {$column};
 667       }
 668 end have_missing_data
 669 # }}} have_missing_data
 670
 671 # {{{ merge
 672 start merge
 673       {
 674         #$self -> synchronize;
 675         push( @{$self -> {'individuals'}}, @{$mergeobj -> individuals} );
 676       }
 677 end merge
 678 # }}} merge
 679
 680 # {{{ max
 681
 682 start max
 683       {
 684         # Either column or column_head must be specified. Column_head must be a string that
 685         # identifies a column in the (optional ) data file header.
 686
 687 # The if-statement below used to be a cache of allready calculated
 688 # means. But since individuals can be accessed in so many ways, we
 689 # don't know when this cache should be updated. Its easier to
 690 # recalculate the max. Maybe we can include this optimization in the
 691 # future, if it turns out to be a bottleneck
 692 #       my $tmp_column = $self -> {'column_head_indices'}{$column_head};
 693 #       if ( defined $self -> {'max'}[$tmp_column] ) {
 694 #         $return_value = $self -> {'max'}[$tmp_column] ;
 695 #       } else {
 696           $self -> synchronize;
 697           my $first_id = $self -> {'individuals'}[0];
 698           die "data -> max: No individuals defined in data object based on ",
 699             $self -> full_name,"\n" unless defined $first_id;
 700           unless ( defined $column  && defined( $first_id -> subject_data ->[0][$column-1] ) ) {
 701             unless (defined($column_head) && defined($self -> {'column_head_indices'}{$column_head})) {
 702               die "Error in data -> max: unknown column: \"$column_head\" or invalid column number: \"$column\"\n";
 703             } else {
 704               $column = $self -> {'column_head_indices'}{$column_head};
 705             }
 706           }
 707           foreach my $individual ( @{$self -> {'individuals'}} ) {
 708             my $ifactors = $individual -> factors( 'column' => $column );
 709             foreach ( keys %{$ifactors} ) {
 710               next if ( $_ == $self -> {'missing_data_token'} );
 711               if ( defined ($return_value) ) {
 712                 $return_value = $_ > $return_value ? $_ : $return_value;
 713               } else {
 714                 $return_value = $_;
 715               }
 716             }
 717           }
 718
 719 #         $self -> {'max'}[$column] = $return_value;
 720           $self -> flush if ( $self -> {'target'} eq 'disk' );
 721 #       }
 722       }
 723 end max
 724
 725 # }}} max
 726
 727 # {{{ min
 728
 729 start min
 730       {
 731         # See L</max>.
 732         my $tmp_column = $self -> {'column_head_indices'}{$column_head};
 733
 734 # The if-statement below used to be a cache of allready calculated
 735 # means. But since individuals can be accessed in so many ways, we
 736 # don't know when this cache should be updated. Its easier to
 737 # recalculate the min. Maybe we can include this optimization in the
 738 # future, if it turns out to be a bottleneck
 739 #       if ( defined $self -> {'min'}[$tmp_column] ) {
 740 #         $return_value = $self -> {'min'}[$tmp_column] ;
 741 #       } else {
 742           $self -> synchronize;
 743           my $first_id = $self -> {'individuals'}[0];
 744           die "data -> min: No individuals defined in data object based on ",
 745             $self -> full_name,"\n" unless defined $first_id;
 746           unless ( defined $column  && defined( $first_id -> subject_data ->[0][$column-1] ) ) {
 747             unless (defined($column_head) && defined($self -> {'column_head_indices'}{$column_head})) {
 748               die "Error in data -> min: unknown column: \"$column_head\" or invalid column number: \"$column\"\n";
 749             } else {
 750               $column = $self -> {'column_head_indices'}{$column_head};
 751             }
 752           }
 753           foreach my $individual ( @{$self -> {'individuals'}} ) {
 754             my $ifactors = $individual -> factors( 'column' => $column );
 755             foreach ( keys %{$ifactors} ) {
 756               next if ( $_ == $self -> {'missing_data_token'} );
 757               if ( defined ($return_value) ) {
 758                 $return_value = $_ < $return_value ? $_ : $return_value;
 759               } else {
 760                 $return_value = $_;
 761               }
 762             }
 763           }
 764 #         $self -> {'min'}[$column] = $return_value;
 765           $self -> flush if ( $self -> {'target'} eq 'disk' );
 766 #       }
 767       }
 768 end min
 769
 770 # }}} min
 771
 772 # {{{ median
 773
 774 start median
 775       {
 776           # See L</max>.
 777         $self -> synchronize;
 778         my $first_id = $self -> {'individuals'}[0];
 779         die "data -> median: No individuals defined in data object based on ",
 780           $self -> full_name,"\n" unless defined $first_id;
 781         unless ( defined $column  && defined( $first_id -> subject_data ->[0][$column-1] ) ) {
 782           unless(defined($column_head) && defined($self -> {'column_head_indices'}{$column_head})){
 783             die "Error in data -> median: unknown column: \"$column_head\" or invalid column number: \"$column\"\n";
 784           } else {
 785             $column = $self -> {'column_head_indices'}{$column_head};
 786           }
 787         }
 788
 789         if( defined $self -> {'median'}[$column] ){
 790           return $self -> {'median'}[$column];
 791         }
 792
 793         my @median_array;
 794
 795         foreach my $individual ( @{$self -> {'individuals'}} ) {
 796           if( $unique_in_individual ){
 797             my $ifactors = $individual -> factors( 'column' => $column );
 798
 799             foreach ( keys %{$ifactors} ) {
 800               next if ( $_ == $self -> {'missing_data_token'} );
 801               push( @median_array, $_ );
 802             }
 803           } else {
 804             my $ifactors = $individual -> subject_data;
 805
 806             for(my $i=0; $i<=$#{$ifactors}; $i++ ) {
 807               next if ( $ifactors -> [$i] -> [$column-1] == $self -> {'missing_data_token'} );
 808               push(@median_array, $ifactors -> [$i] -> [$column-1]);
 809             }
 810           }
 811         }
 812         @median_array = sort {$a <=> $b} @median_array ;
 813
 814         if( @median_array % 2 ){
 815           $return_value = $median_array[$#median_array / 2];
 816         } else {
 817           $return_value = ( $median_array[@median_array / 2] +
 818                             $median_array[(@median_array - 2) / 2] ) / 2;
 819         }
 820
 821         $self -> {'median'}[$column] = $return_value;
 822       }
 823 end median
 824
 825 # }}} median
 826
 827 # {{{ mean
 828
 829 start mean
 830   {
 831      # Returns mean value of a column
 832      # If a individual contains more then 1 value (i.e. if an individual has different values in different samples
 833      # a mean value of all individuals if calculate first, then the mean value of the column
 834      # If hi_cutoff is defined the mean function will cut all value below the cutoff,
 835      # and set their value to 0. It's used to calculate the HI-mean/LOW-mean of a column for e.g. Hockey-stick covariates
 836      # If both hi_cutoff and low_cutoff are defined only the hi_cutoff will be used.
 837      # See L</max>.
 838      my $tmp_column = $self -> {'column_head_indices'}{$column_head};
 839      $self -> synchronize;
 840      my $first_id = $self -> {'individuals'}[0];
 841        die "data -> mean: No individuals defined in data object based on ",
 842         $self -> full_name,"\n" unless defined $first_id;
 843      unless ( defined $column  && defined( $first_id -> subject_data ->[0][$column-1] ) ) {
 844        unless (defined($column_head) && defined($self -> {'column_head_indices'}{$column_head})) {
 845          die "Error in data -> mean: unknown column: \"$column_head\" or invalid column number: \"$column\"\n";
 846        } else {
 847               $column = $self -> {'column_head_indices'}{$column_head};
 848          }
 849      }
 850
 851         ## Here the calculation starts
 852         my $num_individuals = 0;
 853         my $sum = 0;
 854
 855         foreach my $individual ( @{$self ->{'individuals'}} ) {
 856
 857           my $ifactors = $individual -> subject_data;
 858           my $individual_sum = 0;
 859
 860           for(my $i=0; $i<=$#{$ifactors}; $i++ ) {
 861             next if ( $ifactors -> [$i] -> [$column-1] == $self -> {'missing_data_token'} );
 862             if (defined $hi_cutoff) {
 863               if ($ifactors->[$i]->[$column-1]>$hi_cutoff) {
 864                 $individual_sum += $ifactors -> [$i] -> [$column-1]-$hi_cutoff;
 865               }
 866             }
 867             else {
 868               if (defined $low_cutoff) {
 869                 if ($ifactors->[$i]->[$column-1]<$low_cutoff) {
 870                   $individual_sum += $low_cutoff - $ifactors -> [$i] -> [$column-1];
 871                 }
 872               }
 873               else {
 874                 $individual_sum += $ifactors -> [$i] -> [$column-1];
 875               }
 876             }
 877           }
 878           $sum += $individual_sum/($#{$ifactors}+1);
 879           $num_individuals++;
 880         }
 881         $return_value = $sum / $num_individuals;
 882 }
 883
 884 end mean
 885
 886 # }}} mean
 887
 888 # {{{ sd
 889
 890 start sd
 891   {
 892     # This sub returns standard deviation for a specific column
 893     # If there are more than one sample/individual the value used for that specific
 894     # individual is the mean value of its samples.
 895     # The cut-offs are for hockey stick variables. I.e. If one individual value is
 896     # lower than the hi-cutoff the individual value will be zero.
 897     # HI_cutoff is used to calculate the HI-mean of a column.
 898     # If cut_off is undef it won't be used
 899     # See L</max>.
 900      my $tmp_column = $self -> {'column_head_indices'}{$column_head};
 901      $self -> synchronize;
 902      my $first_id = $self -> {'individuals'}[0];
 903      die "data -> sd: No individuals defined in data object based on ",
 904        $self -> full_name,"\n" unless defined $first_id;
 905      unless ( defined $column  && defined( $first_id -> subject_data ->[0][$column-1] ) ) {
 906         unless (defined($column_head) && defined($self -> {'column_head_indices'}{$column_head})) {
 907             die "Error in data -> sd: unknown column: \"$column_head\" or invalid column number: \"$column\"\n";
 908         } else {
 909               $column = $self -> {'column_head_indices'}{$column_head};
 910           }
 911       }
 912
 913         ## Here the calculation starts
 914         my $num_individuals = 0;
 915         my $sum = 0;
 916         my $mean;
 917         if (defined $hi_cutoff) {
 918           $mean = $self->mean(column   => $column,
 919                               hi_cutoff => $hi_cutoff);
 920         } elsif (defined $low_cutoff) {
 921           $mean = $self->mean(column   => $column,
 922                               low_cutoff => $low_cutoff);
 923         } else {
 924           $mean = $self->mean(column => $column);
 925         }
 926
 927         foreach my $individual ( @{$self -> {'individuals'}} ) {
 928           my $ifactors = $individual -> subject_data;
 929           my $individual_sum = 0;
 930
 931           for(my $i=0; $i<=$#{$ifactors}; $i++ ) {
 932             next if ( $ifactors -> [$i] -> [$column-1] == $self -> {'missing_data_token'} );
 933             if (defined $hi_cutoff) {
 934               if ($ifactors->[$i]->[$column-1]>$hi_cutoff) {
 935                 $individual_sum += $ifactors -> [$i] -> [$column-1]-$hi_cutoff;
 936               }
 937             } else {
 938               if (defined $low_cutoff) {
 939                 if ($ifactors->[$i]->[$column-1]<$low_cutoff) {
 940                   $individual_sum += $low_cutoff - $ifactors -> [$i] -> [$column-1];
 941                 }
 942               } else {
 943                 $individual_sum += $ifactors -> [$i] -> [$column-1];
 944               }
 945             }
 946           }
 947           $sum += ($individual_sum/($#{$ifactors}+1) - $mean) ** 2;
 948           $num_individuals++;
 949         }
 950         $return_value = (1/($num_individuals-1)*$sum) ** 0.5;
 951 }
 952
 953 end sd
 954
 955 # }}} sd
 956
 957 # {{{ range
 958 start range
 959       {
 960         # See L</max>.
 961         my $tmp_column = $self -> {'column_head_indices'}{$column_head};
 962         if ( defined $self -> {'range'}[$tmp_column] ) {
 963           $return_value = $self -> {'range'}[$tmp_column];
 964         } else {
 965           my $old_target = $self -> {'target'};
 966           $self -> {'target'} = 'mem';
 967           $self -> synchronize;
 968           $return_value = $self -> max( column      => $column,
 969                                         column_head => $column_head ) -
 970                                           $self -> min( column      => $column,
 971                                                         column_head => $column_head );
 972           $self -> {'range'}[$column] = $return_value;
 973           if ( $old_target eq 'disk' ) {
 974             $self -> flush if ( $self -> {'target'} eq 'disk' );
 975             $self -> {'target'} = 'disk';
 976           }
 977         }
 978       }
 979 end range
 980 # }}} range
 981
 982 # {{{ recalc_column
 983 start recalc_column
 984       {
 985           # Recalculates a column based on expression. Also, see L</max>.
 986         $self -> synchronize;
 987
 988         # Check if $column(-index) is defined and valid, else try to find index using column_head
 989         my $first_id = $self -> {'individuals'}[0];
 990         die "data -> recalc_column: No individuals defined in data object based on ",
 991           $self -> full_name,"\n" unless defined $first_id;
 992         unless ( defined $column  && defined( $first_id -> subject_data ->[0][$column-1] ) ) {
 993           if(defined($column_head) && defined($self -> {'column_head_indices'}{$column_head})){
 994             die "Error in data -> recalc_column: unknown column: \"$column_head\" or column number: \"$column\"\n";
 995           } else {
 996             $column = $self -> {'column_head_indices'}{$column_head};
 997           }
 998         }
 999
1000         for my $individual ( @{$self -> {'individuals'}} ) {
1001           $individual -> recalc_column( column     => $column,
1002                                         expression => $expression );
1003         }
1004       }
1005 end recalc_column
1006 # }}} recalc_column
1007
1008 # {{{ renumber_ascending
1009
1010 start renumber_ascending
1011       {
1012 # Renumbers the individuals (changes the subject identifiers) so that
1013 # all have unique integer numbers starting with start_at and
1014 # ascending. The primary use of this
1015 # method is not to order the individuals after their identifiers but to
1016 # ensure that all individuals have unique identifiers.
1017
1018         $self -> synchronize;
1019         foreach my $individual ( @{$self -> {'individuals'}} ) {
1020           $individual -> idnumber ( $start_at++ );
1021         }
1022         $self -> {'synced'} = 0;
1023       }
1024 end renumber_ascending
1025
1026 # }}} renumber_ascending
1027
1028 # {{{ renumber_descending
1029
1030 start renumber_descending
1031       {
1032           # See L</renumber_ascending>.
1033         $self -> synchronize;
1034         foreach my $individual ( @{$self -> {'individuals'}} ) {
1035           $individual -> idnumber ( $start_at-- );
1036         }
1037         $self -> {'synced'} = 0;
1038       }
1039 end renumber_descending
1040
1041 # }}} renumber_descending
1042
1043 # {{{ single_valued_data
1044
1045 start single_valued_data
1046       {
1047         # Usage:
1048         #
1049         # ($single_value_data_set, $remainder, $column_indexes) =
1050         #      $data_object -> single_valued_data( subset_name         => 'subset.dta',
1051         #                                          remainder_name      => 'remainder.dta',
1052         #                                          target              => 'disk',
1053         #                                          do_not_test_columns => [1..18,24,26];
1054         #
1055         # my $single_value_column_indexes = $column_indexes -> [0];
1056         # my $all_other_column_indexes    = $column_indexes -> [1];
1057         #
1058         # Analyses the content of each column, based on the
1059         # ID column, and returns two new data objects: One
1060         # that contains all columns that is has only one value per
1061         # individual and one that contains the
1062         # remainding data. This is useful for creating compact 'extra'
1063         # data sets that can be read in via user-defined sub-routines
1064         # when the number of columns needed exceeds the maximum that
1065         # NONMEM allows (e.g. 20 in NONMEM version V).
1066         #
1067         # The I<do_not_test_columns> argument specifies on which columns
1068         # to skip the single value test
1069
1070         my @multi_value_flags;
1071         my @individuals = @{$self -> {'individuals'}};
1072         # Initiate the flags:
1073         if ( defined $individuals[0] ) {
1074           my @data = @{$individuals[0] -> {'subject_data'}};
1075           for ( my $i = 0; $i < scalar @{$data[0]}; $i++ ) {
1076             my $dnt_flag = 0;
1077             foreach my $dntc ( @do_not_test_columns ) {
1078               $dnt_flag = 1 if ( $i == $dntc - 1 );
1079             }
1080             $multi_value_flags[$i] = $dnt_flag;
1081           }
1082         } else {
1083           die "data -> single_valued_data: No data in ID number 1\n";
1084         }
1085         # Collect the stats
1086         for ( my $id = 0; $id <= $#individuals; $id++ ) {
1087           my @data = @{$individuals[$id] -> {'subject_data'}};
1088           for ( my $j = 0; $j < scalar @{$data[0]}; $j++ ) {
1089             my %col_unique;
1090             for ( my $i = 0; $i <= $#data; $i++ ) {
1091               $col_unique{$data[$i][$j]}++;
1092             }
1093             my $factors = scalar keys %col_unique;
1094             $multi_value_flags[$j]++ if ( $factors > 1 );
1095           }
1096         }
1097         for ( my $i = 0; $i <= $#multi_value_flags; $i++ ) {
1098           if ( $multi_value_flags[$i] ) {
1099             push ( @{$column_indexes[1]}, $i + 1);
1100           } else {
1101             push ( @{$column_indexes[0]}, $i + 1);
1102           }
1103         }
1104         ( $single_value_data_set, $remainder ) =
1105           $self -> subset_vertically( column_indexes      => $column_indexes[0],
1106                                       subset_name         => $subset_name,
1107                                       return_remainder    => 1,
1108                                       remainder_name      => $remainder_name,
1109                                       target              => $target,
1110                                       keep_first_row_only => 1);
1111       }
1112 end single_valued_data
1113
1114 # }}}
1115
1116 # {{{ subset_vertically
1117
1118 start subset_vertically
1119       {
1120         # Usage:
1121         #
1122         # $subset = $data_object -> subset_vertically ( column_indexes => [1,2,6],
1123         #                                               subset_name    => 'subset.dta' );
1124         #
1125         # This basic usage returns a new data object containing
1126         # columns 1,2 and 6 from the original data plus the
1127         # idcolumn. The new data object will be associated with the
1128         # file 'subset.dta'.
1129         #
1130         # You get the remaining data, i.e. the original data minus
1131         # the created subset by specifying
1132         #
1133         # ( $subset, $remainder ) =
1134         #      $data_object -> subset_vertically ( column_indexes   => [1,2,6],
1135         #                                          subset_name      => 'subset.dta',
1136         #                                          return_remainder => 1,
1137         #                                          remainder_name   => 'remainder.dta' );
1138         #
1139         # If you would like to flush the created data sets to disk and
1140         # save memory, set the I<target> argument to 'disk'. The
1141         # default value 'mem' will keep the whole data object in RAM.
1142         #
1143         # The I<keep_first_row_only> argument can be used to reduce
1144         # the size of the subset data obejct by excluding all but the
1145         # first row of data from each individual.
1146         #
1147         my @individuals = @{$self -> {'individuals'}};
1148         # Create remainder index array if necessary
1149         my @remainder_indexes;
1150         if ( defined $individuals[0] ) {
1151           my @data = @{$individuals[0] -> {'subject_data'}};
1152           my $idcolumn = $individuals[0] -> {'idcolumn'};
1153 #         print "IC: $idcolumn\n";
1154           my $id_flag = 0;
1155           foreach my $use_index ( @column_indexes ) {
1156             $id_flag = 1 if ( $use_index == $idcolumn );
1157           }
1158           if ( $return_remainder ) {
1159 #           @remainder_indexes = ( $idcolumn );
1160             for ( my $i = 0; $i < scalar @{$data[0]}; $i++ ) {
1161               my $rem_flag = 1;
1162               foreach my $use_index ( @column_indexes ) {
1163                 $rem_flag = 0 if ( $i == $use_index -1 );
1164 # or
1165 #                                  $i == $idcolumn -1 );
1166               }
1167               push( @remainder_indexes, $i + 1 ) if ( $rem_flag );
1168             }
1169             unshift( @remainder_indexes, $idcolumn ) if ( $id_flag );
1170           }
1171           unshift( @column_indexes, $idcolumn ) unless ( $id_flag );
1172         } else {
1173           die "data -> single_valued_data: No data in ID number 1\n";
1174         }
1175
1176 #       print "SS: @column_indexes\n";
1177 #       print "R : @remainder_indexes\n";
1178
1179         my @new_ids;
1180         my @new_ids_2;
1181         for ( my $id = 0; $id <= $#individuals; $id++ ) {
1182           my $idnumber = $individuals[$id] -> idnumber;
1183           my $idcolumn = $individuals[$id] -> idcolumn;
1184           my @data = @{$individuals[$id] -> {'subject_data'}};
1185           my @new_data;
1186           my @new_data_2;
1187           my $use_rows = $keep_first_row_only ? 0 : $#data;
1188           for ( my $i = 0; $i <= $use_rows; $i++ ) {
1189             my @new_row;
1190             foreach my $use_index ( @column_indexes ) {
1191               push( @new_row, $data[$i][$use_index-1] );
1192             }
1193 #           print "@new_row $#new_row\n";
1194             push( @new_data, \@new_row );
1195           }
1196           for ( my $i = 0; $i <= $#data; $i++ ) {
1197             if ( $return_remainder ) {
1198               my @new_row_2;
1199               foreach my $use_index ( @remainder_indexes ) {
1200                 push( @new_row_2, $data[$i][$use_index-1] );
1201               }
1202 #             print "@new_row_2 $#new_row_2\n";
1203               push( @new_data_2, \@new_row_2 );
1204             }
1205           }
1206           my $new_id = data::individual -> new( idnumber     => $idnumber,
1207                                                 idcolumn     => $idcolumn,
1208                                                 subject_data => \@new_data );
1209           push( @new_ids, $new_id );
1210           if ( $return_remainder ) {
1211             my $new_id_2;
1212             $new_id_2 = data::individual -> new( idnumber     => $idnumber,
1213                                                  idcolumn     => $idcolumn,
1214                                                  subject_data => \@new_data_2 );
1215           push( @new_ids_2, $new_id_2 );
1216           }
1217         }
1218         my @header = @{$self -> {'header'}};
1219         my @new_header;
1220         foreach my $use_index ( @column_indexes ) {
1221           push( @new_header, @header[$use_index-1] );
1222         }
1223         my @comment = @{$self -> {'comment'}};
1224         my $comment = \@comment;
1225         $subset = data -> new ( filename             => $subset_name,
1226                                 directory            => $self -> {'directory'},
1227                                 ignoresign           => $self -> {'ignoresign'},
1228                                 header               => \@new_header,
1229                                 comment              => $comment,
1230                                 individuals          => \@new_ids,
1231                                 target               => $target,
1232                                 ignore_missing_files => 1 );
1233         if ( $return_remainder ) {
1234           my @new_header_2;
1235           foreach my $use_index ( @remainder_indexes ) {
1236             push( @new_header_2, @header[$use_index-1] );
1237           }
1238           $remainder = data -> new ( filename             => $remainder_name,
1239                                      directory            => $self -> {'directory'},
1240                                      ignoresign           => $self -> {'ignoresign'},
1241                                      header               => \@new_header_2,
1242                                      comment              => $comment,
1243                                      individuals          => \@new_ids_2,
1244                                      target               => $target,
1245                                      ignore_missing_files => 1 );
1246         }
1247       }
1248 end subset_vertically
1249
1250 # }}}
1251
1252 # {{{ subsets
1253
1254 start subsets
1255       {
1256 #       if ( defined $expression and defined $bins ) {
1257 #         die "data -> subset: expression and bins may not both be specified\n";
1258 #       }
1259 #       if ( not ( defined $expression or defined $bins ) ) {
1260 #         die "data -> subset: expression or bins must be specified\n";
1261 #       }
1262         $self -> synchronize;
1263         my @header  = @{$self -> {'header'}};
1264         my @comment = @{$self -> {'comment'}};
1265         my @subset_ids= ();
1266         my %rnd_ids;
1267         my $key = 0;
1268         my @ids = @{$self -> {'individuals'}};
1269         if ( defined $stratify_on ) {
1270             my $work_data = $self -> copy( filename => 'work_data.dta',
1271                                            target   => 'mem' );
1272           my %strata = %{$work_data -> factors( column => $stratify_on )};
1273
1274           while ( my ( $factor, $keys ) = each %strata ) {
1275               foreach my $key ( @{$keys} ) {
1276                   my $rnd_num = rand;
1277                   while ( defined $rnd_ids{$factor}{$rnd_num} ) {
1278                       $rnd_num = rand;
1279                   }
1280                   $rnd_ids{$factor}{$rnd_num} = $ids[$key];
1281               }
1282           }
1283           my $first = 1;
1284           while ( my ( $factor, $rnd_nums ) = each %rnd_ids ) {
1285               my @sort_rnd_nums = sort { $a <=> $b } keys %{$rnd_nums};
1286               for ( my $i = 0; $i <= $#sort_rnd_nums; $i ) {
1287                   for ( my $j = 0; $j < $bins; $j++ ) {
1288                       if ( $first ) {
1289                           push( @subset_ids, [$rnd_ids{$factor}{$sort_rnd_nums[$i]} -> copy] );
1290                           push( @incl_ids, [$rnd_ids{$factor}{$sort_rnd_nums[$i]} -> idnumber] );
1291                       } else {
1292                           push( @{$subset_ids[$j]}, $rnd_ids{$factor}{$sort_rnd_nums[$i]} -> copy );
1293                           push( @{$incl_ids[$j]}, $rnd_ids{$factor}{$sort_rnd_nums[$i]} -> idnumber );
1294                       }
1295                       $i++;
1296                       last if $i > $#sort_rnd_nums;
1297                   }
1298                   $first = 0;
1299               }
1300           }
1301           for ( my $j = 0; $j < $bins; $j++ ) {
1302             my $sdata = data -> new ( header               => \@header,
1303                                       comment              => \@comment,
1304                                       ignoresign           => $self -> {'ignoresign'},
1305                                       individuals          => $subset_ids[$j],
1306                                       ignore_missing_files => 1,
1307                                       target               => $target,
1308                                       idcolumn             => $self -> {'idcolumn'},
1309                                       filename             => "subset_$j.dta" );
1310             #$sdata -> _write;
1311             push( @subsets, $sdata );
1312           }
1313         } else {
1314           for ( my $i = 0; $i <= $#ids; $i++ ) {
1315             my $rnd_num = rand;
1316             while ( defined $rnd_ids{$rnd_num} ) {
1317               $rnd_num = rand;
1318             }
1319             $rnd_ids{$rnd_num} = $ids[$i];
1320           }
1321           my @keys = sort { $a <=> $b } keys %rnd_ids;
1322           my $first = 1;
1323           for ( my $i = 0; $i <= $#keys; $i ) {
1324             for ( my $j = 0; $j < $bins; $j++ ) {
1325               if ( $first ) {
1326                 push( @subset_ids, [$rnd_ids{$keys[$i]} -> copy] );
1327                 push( @incl_ids, [$rnd_ids{$keys[$i]} -> idnumber] );
1328               } else {
1329                 push( @{$subset_ids[$j]}, $rnd_ids{$keys[$i]} -> copy );
1330                 push( @{$incl_ids[$j]}, $rnd_ids{$keys[$i]} -> idnumber );
1331               }
1332               $i++;
1333               last if $i > $#keys;
1334             }
1335             $first = 0;
1336           }
1337           for ( my $j = 0; $j < $bins; $j++ ) {
1338             my $sdata = data -> new ( header               => \@header,
1339                                       comment              => \@comment,
1340                                       ignoresign           => $self -> {'ignoresign'},
1341                                       individuals          => $subset_ids[$j],
1342                                       ignore_missing_files => 1,
1343                                       target               => $target,
1344                                       idcolumn             => $self -> {'idcolumn'},
1345                                       filename             => "subset_$j.dta" );
1346             #$sdata -> _write;
1347             push( @subsets, $sdata );
1348           }
1349         }
1350       }
1351 end subsets
1352
1353 # }}} subsets
1354
1355 # {{{ subset
1356
1357 start subset
1358       {
1359         $self -> synchronize;
1360         my @header  = @{$self -> {'header'}};
1361         my @comment = @{$self -> {'comment'}};
1362         my @subset_inds = ();
1363         my $key = 0;
1364         foreach my $individual ( @{$self -> {'individuals'}} ) {
1365           if ( $individual -> evaluate_expression( column     => $based_on,
1366                                                    expression => $expression ) ) {
1367             push( @subset_inds, $individual -> copy );
1368             push( @incl_individuals, $individual -> idnumber );
1369             push( @included_keys, $key );
1370           }
1371           $key++;
1372         }
1373         $subset = data -> new ( header      => \@header,
1374                                 comment     => \@comment,
1375                                 ignoresign  => $self -> {'ignoresign'},
1376                                 individuals => \@subset_inds,
1377                                 idcolumn    => $self -> {'idcolumn'},
1378                                 filename    => "subset.dta" );
1379       }
1380 end subset
1381
1382 # }}} subset
1383
1384 # {{{ target
1385 start target
1386       {
1387         if ( $parm eq 'disk' and $self -> {'target'} eq 'mem' ) {
1388           $self -> {'target'} = 'disk';
1389           $self -> flush;
1390         } elsif ( $parm eq 'mem' and $self -> {'target'} eq 'disk' ) {
1391           $self -> {'target'} = 'mem';
1392           $self -> synchronize;
1393         }
1394       }
1395 end target
1396 # }}}
1397
1398 # {{{ _write
1399
1400 start _write
1401     {
1402       die "ERROR: data -> _write: No filename set in data object.\n"
1403           if( $filename eq '' );
1404
1405       if( not defined $self -> {'individuals'} ){
1406         unless( $filename eq $self -> full_name ){
1407           $self -> synchronize;
1408         }
1409       }
1410
1411       open(FILE,">$filename") ||
1412           die "Could not create $filename\n";
1413       if ( defined $self -> comment ) {
1414         my @comment   = @{$self -> comment};
1415         for ( @comment ) { print ( FILE ); }
1416       }
1417       if ((defined $self -> header) and (defined $self -> ignoresign)) {
1418         print FILE $self -> ignoresign,join(',',@{$self -> header}),"\n";
1419       }
1420       my @data = @{$self -> format_data};
1421       for ( @data ) {
1422         print ( FILE );
1423       }
1424       close(FILE);
1425
1426       if ( $PsN::config -> {'_'} -> {'use_database'} ) {
1427         # Backslashes messes up the sql syntax
1428         my $file_str = $self->{'filename'};
1429         my $dir_str = $self->{'directory'};
1430         $file_str =~ s/\\/\//g;
1431         $dir_str =~ s/\\/\//g;
1432
1433         # md5sum
1434         my $md5sum = md5_hex(OSspecific::slurp_file($self-> full_name ));
1435         my ( $date_cmd, $time_cmd );
1436         if ( $Config{osname} eq 'MSWin32' ) {
1437           $date_cmd = 'date /T';
1438           $time_cmd = 'time /T';
1439         } else {
1440           # Assuming UNIX
1441           $date_cmd = 'date';
1442           $time_cmd = 'time';
1443         }
1444
1445         my $date_str = `$date_cmd`;
1446         my $time_str = `$time_cmd`;
1447         chomp($date_str);
1448         chomp($time_str);
1449         my $date_time = "$date_str $time_str";
1450
1451         my $dbh = DBI -> connect("DBI:mysql:host=localhost;databse=psn",
1452                                  "psn", "psn_test",
1453                                  {
1454                                   'RaiseError' => 1});
1455         my $sth;
1456         if ( defined $self -> {'data_id'} ) {
1457           $sth = $dbh -> prepare( "UPDATE psn.data ".
1458                                   "SET filename='$file_str',date='$date_time',".
1459                                   "directory='$dir_str',md5sum='$md5sum' ".
1460                                   "WHERE data_id='".$self -> {'data_id'}."'" );
1461           $sth -> execute or debug -> die( message => $sth->errstr ) ;
1462         } else {
1463           $sth = $dbh -> prepare("INSERT INTO psn.data (filename,date,directory,md5sum) ".
1464                                  "VALUES ('$file_str', '$date_time', '$dir_str','".
1465                                  $md5sum."' )");
1466           $sth -> execute;
1467           $self -> {'data_id'} = $sth->{'mysql_insertid'};
1468         }
1469         $sth -> finish;
1470         $dbh -> disconnect;
1471       }
1472     }
1473 end _write
1474
1475 # }}} _write
1476
1477 # {{{ flush
1478 start flush
1479     {
1480       # synchronizes the object with the file on disk and empties
1481       # most of the objects attributes to save memory.
1482       if( defined $self -> {'individuals'} and
1483           ( !$self -> {'synced'} or $force ) ) {
1484         $self -> _write;
1485       }
1486       $self -> {'header'} = undef;
1487       $self -> {'comment'} = undef;
1488       $self -> {'individuals'} = undef;
1489       $self -> {'synced'} = 0;
1490     }
1491 end flush
1492 # }}} flush
1493
1494 # {{{ synchronize
1495
1496 start synchronize
1497     {
1498       # synchronizes the object with the file on disk
1499       unless( $self -> {'synced'} ){
1500         if( defined $self -> {'individuals'} and
1501             scalar @{$self -> {'individuals'}} > 0 ){
1502           # We should not read new data from file if we
1503           # have an individuals defined?
1504           # Perhaps there should be an attribute
1505           # 'from_file' that overrides this and reads in
1506           # the data from the file specified in filename
1507           # and overwrites whatever the object already
1508           # contains?
1509 #           if( -e $self -> {'filename'} ){
1510 #             $self -> _read_header;
1511 #             $self -> _read_individuals;
1512 #           }
1513           $self -> _write;
1514         } else {
1515           if( -e $self -> full_name ){
1516             $self -> _read_header;
1517             $self -> _read_individuals;
1518           } else {
1519             debug -> die( message => "Fatal error: datafile: " . $self -> full_name . " does not exist." );
1520             return;
1521           }
1522         }
1523       }
1524       my $i = 1;
1525       foreach my $head ( @{$self -> {'header'}} ){
1526         $self -> {'column_head_indices'} -> {$head} = $i;
1527         $i++;
1528       }
1529       $self -> {'synced'} = 1;
1530     }
1531 end synchronize
1532
1533 # }}} synchronize
1534
1535 # {{{ _fisher_yates_shuffle
1536
1537 start _fisher_yates_shuffle
1538       {
1539         my $arr_ref = $parm{'array'};
1540         debug -> warn( level   => 1,
1541                        message => "Array of zero length received" )
1542           if ( scalar @{$arr_ref} < 1 );
1543         my $i;
1544         for ($i = @$arr_ref; --$i; ) {
1545           my $j = random_uniform_integer(1,0,$i);
1546 #          my $j = int rand ($i+1);
1547 #         print "$j $j_new\n";
1548           @$arr_ref[$i,$j] = @$arr_ref[$j,$i];
1549         }
1550       }
1551 end _fisher_yates_shuffle
1552
1553 # }}} _fisher_yates_shuffle
1554
1555 # {{{ _read_header
1556
1557 start _read_header
1558       {
1559         my $filename   = $self -> full_name;
1560         my $ignoresign = $self -> ignoresign;
1561         my ( @data, @new_record, $row, $tmp_row, @header, $hdrstring );
1562         open(DATAFILE,"$filename") ||
1563           die "Could not open $filename for reading";
1564         my $columns;
1565         while (<DATAFILE>) {
1566           $tmp_row    = $_;
1567           #     @new_record = split(/\,|\s+/,$_);
1568           if ( ! (/^\s*\d+|^\s*\./) ) {
1569             $data[$row] = $tmp_row;
1570             $row++;
1571           } else {
1572             # We have reached the first data-row, return.
1573             $columns = scalar split(/\,\s*|\s+/);
1574             last;
1575           }
1576         }
1577         close(DATAFILE);
1578         chomp( $hdrstring = pop(@data));
1579         @header = split(/\,\s*|\s+/,$hdrstring);
1580         $header[0] =~ s/$ignoresign//
1581           if ( defined $self->ignoresign );
1582         shift( @header ) if ( $header[0] eq "" );
1583         $self -> {'header'} = \@header;
1584         $self -> {'comment'} = \@data;
1585         if ( $PsN::config -> {'_'} -> {'use_database'} ) {
1586           my $dbh = DBI -> connect("DBI:mysql:host=localhost;databse=psn",
1587                                    "psn", "psn_test",
1588                                    {'RaiseError' => 1});
1589           if ( scalar @header < 1 ) {
1590             for ( my $i = 1; $i <= $columns; $i++ ) {
1591               push( @header, $i );
1592             }
1593           }
1594           for ( my $i = 0; $i <= $#header; $i++ ) {
1595             my $sth = $dbh -> prepare("INSERT INTO psn.data_column ".
1596                                       "(name,number,data_id) ".
1597                                       "VALUES ('".$header[$i]."', '".($i+1).
1598                                       "', '".$self -> {'data_id'}."' )");
1599             $sth -> execute;
1600             push( @{$self -> {'data_column_ids'}}, $sth->{'mysql_insertid'} );
1601             $sth -> finish;
1602           }
1603           $dbh -> disconnect;
1604         }
1605       }
1606 end _read_header
1607
1608 # }}} _read_header
1609
1610 # {{{ _read_individuals
1611
1612 start _read_individuals
1613       {
1614         my $idcol        = $self -> idcolumn;
1615         my $filename     = $self -> full_name;
1616         debug -> warn( level   => 1,
1617                        message => "Building array of individuals from file " . $self -> {'filename'} );
1618         open(DATAFILE,"$filename") ||
1619           die "Could not open $filename for reading";
1620         my ( @new_row, $new_ID, $old_ID, @init_data );
1621         my $buffer;
1622         my $lines = 0;
1623         while (sysread DATAFILE, $buffer, 4096) {
1624           $lines += ($buffer =~ tr/\n//);
1625         }
1626         seek( DATAFILE, 0,0 );
1627
1628         # For status bar:
1629         my $steps = 51;
1630         my $step = $lines/$steps;
1631         my $counter = 1;
1632         my $step_counter = 1;
1633
1634         ui -> print( category => 'scm',
1635                      message  => "Reading data file: ".$self -> filename );
1636
1637         my ( $sth, $dbh, $first_row_id, $first_value_id );
1638         my $insert = 1;
1639         if ( $PsN::config -> {'_'} -> {'use_database'} ) {
1640           $dbh = DBI -> connect("DBI:mysql:host=localhost;databse=psn",
1641                                 "psn", "psn_test",
1642                                 {'RaiseError' => 1});
1643           my $sth = $dbh -> prepare( "SELECT data_row_id FROM psn.data_row ".
1644                                      "WHERE data_id='".$self -> {'data_id'}."'" );
1645           $sth -> execute or debug -> die( message => $sth->errstr ) ;
1646           my $select_arr = $sth -> fetchall_arrayref;
1647           if ( scalar @{$select_arr} > 0 ) {
1648             for ( my $i = 0; $i < scalar @{$select_arr}; $i++ ) {
1649               push( @{$self -> {'data_row_ids'}}, $select_arr->[$i][0] );
1650             }
1651             $sth = $dbh -> prepare( "SELECT data_value_id FROM psn.data_value ".
1652                                     "WHERE data_id='".$self -> {'data_id'}."'" );
1653             $sth -> execute or debug -> die( message => $sth->errstr ) ;
1654             my $select_val = $sth -> fetchall_arrayref;
1655             for ( my $i = 0; $i < scalar @{$select_val}; $i++ ) {
1656               push( @{$self -> {'data_value_ids'}}, $select_val->[$i][0] );
1657             }
1658             $insert = 0;
1659             $dbh -> disconnect;
1660           } else {
1661             $dbh -> do( "LOCK TABLES psn.data_row WRITE, psn.data_value WRITE" );
1662             $sth = $dbh -> prepare( "SELECT MAX(data_row_id) FROM psn.data_row" );
1663             $sth -> execute or debug -> die( message => $sth->errstr ) ;
1664             my $select_arr = $sth -> fetchall_arrayref;
1665             $first_row_id = defined $select_arr -> [0][0] ? $select_arr -> [0][0] : 0;
1666             $sth = $dbh -> prepare( "SELECT MAX(data_value_id) FROM psn.data_value" );
1667             $sth -> execute or debug -> die( message => $sth->errstr ) ;
1668             my $select_arr = $sth -> fetchall_arrayref;
1669             $first_value_id = defined $select_arr -> [0][0] ? $select_arr -> [0][0] : 0;
1670           }
1671           $sth -> finish;
1672         }
1673
1674         my $insert_rows;
1675         my $insert_values;
1676         my $row_counter = 0;
1677         while ( <DATAFILE> ) {
1678           s/^ *//;
1679           my @new_row   = split(/\,\s*|\s+/);
1680           # This regexp check is not time consuming.
1681           if ( /^\s*\d+|^\s*\./ ) {
1682             $new_ID = $new_row[$idcol-1]; # index starts at 0
1683             $old_ID = $new_ID if ( not defined $old_ID );
1684
1685             # Check if column miss data at some row (This adds about 30% of init time)
1686             my $mdt = $self -> {'missing_data_token'};
1687             for( my $i = 0; $i <= $#new_row; $i++ ){
1688                 $self -> {'have_missing_data'} -> {$i+1} = 1
1689                   if( $new_row[$i] == $mdt ); # == is slower but safer than eq
1690             }
1691             if ( $PsN::config -> {'_'} -> {'use_database'} and $insert ) {
1692               $row_counter++;
1693               $insert_rows = $insert_rows."," if ( defined $insert_rows );
1694               $insert_rows = $insert_rows.
1695                 "('$row_counter', '".$self -> {'data_id'}."' )";
1696               for ( my $j = 0; $j <= $#new_row; $j++ ) {
1697                 $insert_values = $insert_values."," if ( defined $insert_values );
1698                 $insert_values = $insert_values.
1699                   "('".$new_row[$j]."', '".
1700                     ($first_row_id+$row_counter)."', '".
1701                       $self -> {'data_column_ids'}->[$j].
1702                         "', '".$self -> {'data_id'}."' )";
1703               }
1704             }
1705
1706             if ( $new_ID != $old_ID ) {
1707               my @subject_data = @init_data;
1708               push( @{$self -> {'individuals'}},
1709                     data::individual -> new ( idcolumn     => $idcol,
1710                                               subject_data => \@subject_data,
1711                                               data_id      => $self -> {'data_id'} ) );
1712               @init_data = (\@new_row);
1713             } else {
1714               push( @init_data, \@new_row );
1715             }
1716             $old_ID = $new_ID;
1717           }
1718           if ( $step > 1 ) {
1719             if ( not ( $counter % $step ) ) {
1720               my $nl = $step_counter == $steps ? "" : "\r";
1721               ui -> print( category => 'scm',
1722                            message  => ui -> status_bar( sofar => $counter,
1723                                                          goal  => $lines,
1724                                                          width => $steps-1 ).$nl,
1725                            wrap     => 0,
1726                            newline  => 0 );
1727               $step_counter++;
1728             }
1729           }
1730           $counter++;
1731         }
1732         if ( $PsN::config -> {'_'} -> {'use_database'} and $insert ) {
1733           $dbh -> do("INSERT INTO psn.data_row ".
1734                      "(number,data_id) ".
1735                      "VALUES ".$insert_rows);
1736           push( @{$self -> {'data_row_ids'}}, ($first_row_id..$first_row_id+$row_counter) );
1737           $dbh -> do( "INSERT INTO psn.data_value ".
1738                       "(value,data_row_id,data_column_id,data_id) ".
1739                       "VALUES ".$insert_values );
1740           push( @{$self -> {'data_value_ids'}},
1741                 ($first_value_id..$first_value_id+($row_counter*
1742                                                    scalar @{$self->{'data_column_ids'}})));
1743           $dbh -> do( "UNLOCK TABLES" );
1744           $dbh -> disconnect;
1745         }
1746
1747         if ( $#init_data >= 0 ) {
1748           push( @{$self -> {'individuals'}},
1749                 data::individual -> new ( idcolumn     => $idcol,
1750                                           subject_data => \@init_data ) );
1751         }
1752         ui -> print( category => 'scm',
1753                      message  => " ... done" );
1754         close(DATAFILE);
1755 #       $self -> _write( filename => 'test.dta' );
1756       }
1757 end _read_individuals
1758
1759 # }}} _read_individuals
1760