lib/data_subs.pm

   1 This file contains the code for the subroutines in
   2 Perl-speaks-NONMEM's data module. It is not functional by itself.
   3 The code should be transferred to the module autogenerated by dia2code
   4 using the fill_diacode.pl script.
   5
   6 # {{{ include
   7
   8 start include statements
   9 use Digest::MD5 'md5_hex';
  10 use OSspecific;
  11 use File::Copy "cp";
  12 use Carp;
  13 use Carp qw(cluck);
  14 use Config;
  15 use Math::Random;
  16 use Storable;
  17 use debug;
  18 use ui;
  19 use status_bar;
  20 use Data::Dumper;
  21 use Time::HiRes qw(gettimeofday);
  22 my @primary_column_names = ('ID', 'DATE', 'DAT1', 'DAT2', 'DAT3' ,'L1', 'L2', 'DV', 'MDV', 'RAW_', 'MRG_', 'RPT_', 'TIME', 'DROP', 'SKIP', 'EVID', 'AMT', 'RATE', 'SS', 'II', 'ADDL', 'CMT', 'PCMT', 'CALL');
  23 end include
  24
  25 # }}} include statements
  26
  27 # {{{ description
  28
  29 start description
  30     # The structure of the data class is subject-centric, recognising that
  31     # the subjects included in a study often can be regarded as
  32     # independent. A class for the subject level exists within PsN and is
  33     # called the individual class. A data object consists of at least one
  34     # but probably many individual objects plus optional comments.
  35 end description
  36
  37 # }}} description
  38
  39 # {{{ synopsis
  40
  41 start synopsis
  42     #   use data;
  43     #
  44     #   my $data_obj = data -> new ( filename => 'test040314.dta' );
  45     #
  46     #   $data_obj -> renumber_ascending;
  47     #
  48     #   my $subsets_ref = $data_obj -> case_deletion( bins => 10 );
  49     #
  50     #   my @subsets = @{$subsets_ref};
  51 end synopsis
  52
  53 # }}} synopsis
  54
  55 # {{{ see_also
  56
  57 start see_also
  58     # =begin html
  59     #
  60     # <a HREF="model.html">model</a>, <a HREF="output.html">output</a>,
  61     # <a HREF="tool/modelfit.html">tool::modelfit</a>,
  62     # <a HREF="tool.html">tool</a>
  63     #
  64     # =end html
  65     #
  66     # =begin man
  67     #
  68     # model, output, tool::modelfit, tool
  69     #
  70     # =end man
  71 end see_also
  72
  73 # }}} see_also
  74
  75 # {{{ new
  76
  77 start new
  78       {
  79         # If the column holding the subject identifier is not the
  80         # first, it can be specified using the I<idcolumn> attribute.
  81         #
  82         # I<ignoresign> determines which rows are regarded as
  83         # comments. Corresponds to the IGNORE= option in the $DATA
  84         # record in a NONMEM model file.
  85         # use_data_table is only consulted by the commented-out database registration calls further down.
  86         $this -> {'use_data_table'} = 0;
  87         # NOTE(review): absolute_path presumably returns a normalised directory and a bare file name - confirm in OSspecific.
  88         ( $this -> {'directory'},
  89           $this -> {'filename'} ) = OSspecific::absolute_path( $this -> {'directory'},
  90                                                                $this->{'filename'} );
  91
  92         debug -> warn( level => 2,
  93                        message => "data -> new: Data object initialized from file: ".
  94                        $this -> full_name );
  95         # The commented-out sub below is an older, inline version of the register_in_database method.
  96 #       sub register_in_database {
  97 #         my $this = shift;
  98 #         # Backslashes messes up the sql syntax
  99 #         my $file_str = $this->{'filename'};
 100 #         my $dir_str = $this->{'directory'};
 101 #         $file_str =~ s/\\/\//g;
 102 #         $dir_str =~ s/\\/\//g;
 103
 104 #         # md5sum
 105 #         my $md5sum = md5_hex(OSspecific::slurp_file($this-> full_name ));
 106 #         my $dbh = DBI ->
 107 #             connect("DBI:mysql:host=".$PsN::config -> {'_'} -> {'database_server'}.
 108 #                     ";databse=".$PsN::config -> {'_'} -> {'project'},
 109 #                     $PsN::config -> {'_'} -> {'user'},
 110 #                     $PsN::config -> {'_'} -> {'password'},
 111 #                     {'RaiseError' => 1});
 112 #         my $sth;
 113 #         my $sth = $dbh -> prepare( "SELECT data_id FROM ".$PsN::config -> {'_'} -> {'project'}.
 114 #                              ".data ".
 115 #                                    "WHERE filename = '$file_str' AND ".
 116 #                                    "directory = '$dir_str' AND ".
 117 #                                    "md5sum = '".$md5sum."'" );
 118 #         $sth -> execute or debug -> die( message => $sth->errstr ) ;
 119 #         my $select_arr = $sth -> fetchall_arrayref;
 120 #         if ( scalar @{$select_arr} > 0 ) {
 121 #           debug -> warn( level   => 1,
 122 #                          message => "Found an old entry in the database matching the ".
 123 #                          "current data file" );
 124 #           if ( scalar @{$select_arr} > 1 ) {
 125 #             debug -> warn( level   => 1,
 126 #                            message => "Found more than one matching entry in database".
 127 #                            ", using the first" );
 128 #           }
 129 #           $this -> {'data_id'} = $select_arr->[0][0];
 130 #         } else {
 131 #           my ( $date_str, $time_str );
 132 #           if ( $Config{osname} eq 'MSWin32' ) {
 133 #             $date_str = `date /T`;
 134 #             $time_str = ' '.`time /T`;
 135 #           } else {
 136 #             # Assuming UNIX
 137 #             $date_str = `date`;
 138 #           }
 139 #           chomp($date_str);
 140 #           chomp($time_str);
 141 #           my $date_time = $date_str.$time_str;
 142 #           $sth = $dbh -> prepare("INSERT INTO ".$PsN::config -> {'_'} -> {'project'}.
 143 #                              ".data (filename,date,directory,md5sum) ".
 144 #                                  "VALUES ('$file_str', '$date_time', '$dir_str','".
 145 #                                  $md5sum."' )");
 146 #           $sth -> execute;
 147 #           $this -> {'data_id'} = $sth->{'mysql_insertid'};
 148 #         }
 149 #         $sth -> finish;
 150 #         $dbh -> disconnect;
 151 #         }
 152         # First case: neither a non-empty header nor non-empty individuals were
 153         # passed in, so the content can only come from the file on disk.
 154         unless ( ( defined $this -> {'header'} and
 155                    scalar @{$this -> {'header'}} > 0 ) or
 156                  ( defined $this -> {'individuals'} and
 157                    scalar @{$this -> {'individuals'}} > 0 ) ) {
 158           if ( -e $this -> full_name ) {
 159             if ( $this -> {'target'} eq 'mem' ) { # read the file now and mark the object as synced
 160 #             &register_in_database( $this ) if ( $PsN::config -> {'_'} -> {'use_database'} and
 161 #                                                 $this -> {'use_data_table'} );
 162               $this -> _read_header;
 163               $this -> _read_individuals;
 164               $this -> {'synced'} = 1;
 165             } else { # any other target: leave the file unread for now
 166               $this -> {'synced'} = 0;
 167             }
 168           } else { # no content and no file: fatal unless ignore_missing_files is set
 169             debug -> die(message => "No header, individuals, and no file " . $this -> full_name . " on disk.")
 170               unless $this -> {'ignore_missing_files'};
 171             $this ->  {'synced'} = 0;
 172           }
 173         } else { # second case: a header and/or individuals were passed in
 174           if ( $this -> {'target'} eq 'mem') {
 175             if ( -e $this -> {'filename'} ) { # NOTE(review): tests the bare filename, not full_name as in the first case - confirm this is intended
 176               $this -> _read_header;
 177 #             &register_in_database if ( $PsN::config -> {'_'} -> {'use_database'} and
 178 #                                        $this -> {'use_data_table'} );
 179               $this -> _read_individuals;
 180               $this -> {'synced'} = 1;
 181             } else {
 182               debug -> die(message => "No file:".$this->{'filename'}." on disk" )
 183                   unless $this -> {'ignore_missing_files'};
 184               $this -> {'synced'} = 0;
 185             }
 186           } else { # target is not 'mem': hand the passed-in content to flush
 187             $this -> flush;
 188           }
 189         }
 190         # For a synced object, map each header name to its one-based column number.
 191         if ( $this -> {'synced'} ) {
 192           my $i = 1;
 193           foreach my $head ( @{$this -> {'header'}} ) {
 194             $this -> {'column_head_indices'} -> {$head} = $i;
 195             $i++;
 196           }
 197         }
 198 #       $Data::Dumper::Maxdepth = 3;
 199 #       die Dumper $this -> {'individuals'};
 200       }
 201 end new
 202
 203 # }}} new
 204
 205 # {{{ register_in_database
 206 start register_in_database
 207     # Registers this data file in the PsN database (only when use_database is
 208     # set in the configuration) and returns the data id in $data_id.
 209     #
 210     # Unless $force is set, an existing row in <project>.data with the same
 211     # filename, directory and md5sum is reused and its individual ids are
 212     # read from <project>.data_individual. Otherwise, if the object holds
 213     # individuals, a new data row is inserted (with model_id when given) and
 214     # the individuals are either linked through the given @individual_ids or
 215     # first inserted into <project>.individual.
 216     if ( $PsN::config -> {'_'} -> {'use_database'} ) {
 217       # Backslashes messes up the sql syntax
 218       my $file_str = $self->{'filename'};
 219       my $dir_str = $self->{'directory'};
 220       $file_str =~ s/\\/\//g;
 221       $dir_str =~ s/\\/\//g;
 222
 223       my $project = $PsN::config -> {'_'} -> {'project'};
 224       # md5sum
 225       my $md5sum = md5_hex(OSspecific::slurp_file($self-> full_name ));
 226
 227       # The DSN key used to be misspelled 'databse'.
 228       my $dbh = DBI -> connect("DBI:mysql:host=".$PsN::config -> {'_'} -> {'database_server'}.
 229                                ";database=".$project,
 230                                $PsN::config -> {'_'} -> {'user'},
 231                                $PsN::config -> {'_'} -> {'password'},
 232                                {'RaiseError' => 1});
 233
 234       # A single statement handle is reused throughout, so the finish call at
 235       # the end always releases the handle that was used last.
 236       my $sth;
 237       my $select_arr = [];
 238
 239       # Values are bound through placeholders so that quotes in file or
 240       # directory names cannot break the statements.
 241       if ( not $force ) {
 242         $sth = $dbh -> prepare( "SELECT data_id FROM ".$project.".data ".
 243                                 "WHERE filename = ? AND directory = ? AND md5sum = ?" );
 244         $sth -> execute( $file_str, $dir_str, $md5sum )
 245             or debug -> die( message => $sth->errstr );
 246         $select_arr = $sth -> fetchall_arrayref;
 247       }
 248
 249       if ( scalar @{$select_arr} > 0 ) {
 250         'debug' -> warn( level   => 1,
 251                          message => "Found an old entry in the database matching the ".
 252                          "current data file" );
 253         if ( scalar @{$select_arr} > 1 ) {
 254           'debug' -> warn( level   => 1,
 255                            message => "Found more than one data matching entry in database".
 256                            ", using the first" );
 257         }
 258         $self -> {'data_id'} = $select_arr->[0][0];
 259         # Find the id's
 260         $sth = $dbh -> prepare( "SELECT individual_id FROM ".$project.".data_individual ".
 261                                 "WHERE data_id = ?" );
 262         $sth -> execute( $self -> {'data_id'} ) or debug -> die( message => $sth->errstr );
 263         $self -> {'individual_ids'} = [ map { $_ -> [0] } @{$sth -> fetchall_arrayref} ];
 264       } elsif ( defined $self -> {'individuals'} ) {
 265         # Time stamp taken from the system's date command(s). $time_str stays
 266         # empty (not undefined) on non-Windows systems.
 267         my $date_str;
 268         my $time_str = '';
 269         if( $Config{osname} eq 'MSWin32' ){
 270           $date_str = `date /T`;
 271           $time_str = ' '.`time /T`;
 272         } else {
 273           # Assuming UNIX
 274           $date_str = `date`;
 275         }
 276         chomp($date_str);
 277         chomp($time_str);
 278         my $date_time = $date_str.$time_str;
 279
 280         my @columns = ( 'filename', 'date', 'directory', 'md5sum', 'resampled' );
 281         my @values  = ( $file_str, $date_time, $dir_str, $md5sum, $resampled ? '1' : '0' );
 282         if ( defined $model_id ) {
 283           unshift( @columns, 'model_id' );
 284           unshift( @values, $model_id );
 285         }
 286         $sth = $dbh -> prepare( "INSERT INTO ".$project.".data (".join( ', ', @columns ).") ".
 287                                 "VALUES (".join( ', ', ('?') x scalar( @columns ) ).")" );
 288         $sth -> execute( @values );
 289         $self -> {'data_id'} = $sth->{'mysql_insertid'};
 290
 291         if ( defined $self -> {'data_id'} ) {
 292           if( $#individual_ids >= 0 ) {
 293             $self -> register_di_relation( individual_ids => \@individual_ids );
 294           } else {
 295             my $inds = scalar @{$self -> {'individuals'}};
 296             $dbh -> do( "LOCK TABLES ".$project.".individual WRITE" );
 297             $dbh -> do( 'USE '.$project );
 298             # Field 10 of the status row is used as the first new individual id.
 299             # NOTE(review): presumably the Auto_increment field - confirm for the MySQL version in use.
 300             $sth = $dbh -> prepare( "SHOW TABLE STATUS LIKE 'individual'" );
 301             $sth -> execute or debug -> die( message => $sth->errstr ) ;
 302             my $status_arr = $sth -> fetchall_arrayref;
 303             my $first_id_id = $status_arr -> [0][10] ? $status_arr -> [0][10] : 0;
 304             my $last_id_id = $first_id_id + $inds - 1;
 305             # One "( id_key, id )" tuple per defined individual.
 306             my @rows;
 307             for( my $i = 0; $i < $inds; $i++ ) {
 308               if( defined $self -> {'individuals'}[$i] ) {
 309                 my $id_id = $self -> {'individuals'}[$i] -> idnumber;
 310                 push( @rows, "( $i, $id_id )" );
 311               }
 312             }
 313             # Without any tuple the INSERT would be a syntax error, so skip it.
 314             if ( scalar @rows > 0 ) {
 315               $sth = $dbh -> prepare( "INSERT INTO ".$project.
 316                                       ".individual ( id_key, id ) VALUES ".join( ',', @rows ) );
 317               $sth -> execute;
 318             }
 319             $dbh -> do( "UNLOCK TABLES" );
 320             @individual_ids = ($first_id_id .. $last_id_id);
 321             $self -> register_di_relation( individual_ids => \@individual_ids );
 322           }
 323           $self -> {'individual_ids'} = \@individual_ids;
 324         }
 325       }
 326       # Release the handles on every path, not only after an insert.
 327       $sth -> finish if ( defined $sth );
 328       $dbh -> disconnect;
 329       $data_id = $self -> {'data_id'}; # return the data_id
 330     }
 331 end register_in_database
 324 # }}} register_in_database
 325
 326 # {{{ register_di_relation
 327 start register_di_relation
 328   # Links individuals to this data set by inserting one
 329   # (data_id, individual_id) row per defined id in @individual_ids into
 330   # <project>.data_individual. Does nothing unless use_database is set in
 331   # the configuration, data_id is defined and at least one id is defined.
 332   my @defined_ids = grep { defined $_ } @individual_ids;
 333   if ( $PsN::config -> {'_'} -> {'use_database'} and
 334        defined $self -> {'data_id'} and scalar @defined_ids > 0 ) {
 335     my $project = $PsN::config -> {'_'} -> {'project'};
 336     # DBI attribute names are case sensitive ('RaiseError', not
 337     # 'raiseerror'); the DSN key used to be misspelled 'databse'.
 338     my $dbh = DBI -> connect("DBI:mysql:host=".$PsN::config -> {'_'} -> {'database_server'}.
 339                              ";database=".$project,
 340                              $PsN::config -> {'_'} -> {'user'},
 341                              $PsN::config -> {'_'} -> {'password'},
 342                              {'RaiseError' => 1});
 343     # One "( ?, ? )" placeholder pair per relation; the values are bound at execute.
 344     my $placeholders = join( ',', ('( ?, ? )') x scalar( @defined_ids ) );
 345     my $sth = $dbh -> prepare( "INSERT INTO ".$project.
 346                                ".data_individual ( data_id, individual_id ) VALUES $placeholders" );
 347     $sth -> execute( map { ( $self -> {'data_id'}, $_ ) } @defined_ids );
 348     $sth -> finish;
 349     $dbh -> disconnect;
 350   }
 351 end register_di_relation
 351 # }}} register_di_relation
 352
 353 # {{{ full_name
 354
 355 start full_name
 356       { # The full name is the directory followed directly by the file name; no separator is added.
 357         $full_name = join( '', @{$self}{ 'directory', 'filename' } );
 358       }
 359 end full_name
 360
 361 # }}}
 362
 363 # {{{ bootstrap
 364
 365 start bootstrap
 366       {
 367         # The bootstrap method draws I<samples> number of bootstrap
 368         # samples from the data set. The I<subjects> argument
 369         # determines the size of each sample (default equals to the
 370         # number of individuals in the original data set). The method
 371         # returns references to three arrays: I<boot_samples_ref>,
 372         # which holds the bootstrap data sets, I<incl_individuals_ref>
 373         # which holds arrays containing the subject identifiers (ID's)
 374         # for the included individuals of each bootstrap data set and
 375         # I<included_keys_ref> which holds the key or index of the
 376         # included individuals, as returned by the resample method.
 377         # NOTE(review): resample draws keys from 0 to n-1, so the keys
 378         # appear to be zero-based array indices - confirm before relying on it.
 379         #
 380         # Sample number i gets the file name "<directory>/<name_stub>_<i>.dta",
 381         # or "<directory>/bs<i>.dta" when no name_stub is given, and is
 382         # passed element i-1 of @model_ids as its model_id.
 383         $self -> synchronize;
 384
 385         my $status_bar = status_bar -> new( steps => $samples );
 386         ui -> print( category => 'bootstrap',
 387                      message  => $status_bar -> print_step,
 388                      newline  => 0 );
 389
 390         foreach my $i ( 1 .. $samples ) {
 391           my $new_name = defined $name_stub ? $name_stub."_$i.dta" : "bs$i.dta";
 392           $new_name = $directory.'/'.$new_name;
 393           my ( $boot, $incl_ind_ref, $incl_key_ref ) =
 394             $self -> resample( subjects    => $subjects,
 395                                resume      => $resume,
 396                                new_name    => $new_name,
 397                                target      => $target,
 398                                stratify_on => $stratify_on,
 399                                model_id    => $model_ids[$i-1] );
 400           push( @included_keys, $incl_key_ref );
 401           push( @incl_individuals, $incl_ind_ref );
 402           push( @boot_samples, $boot );
 403           # Update the progress display whenever tick returns true.
 404           if( $status_bar -> tick() ){
 405             ui -> print( category => 'bootstrap',
 406                          message  => $status_bar -> print_step,
 407                          newline  => 0,
 408                          wrap     => 0 );
 409           }
 410         }
 411         ui -> print( category => 'bootstrap',
 412                      message  => ' ... done' );
 413       }
 414 end bootstrap
 418
 419 # }}} bootstrap
 420
 421 # {{{ resample
 422
 423 start resample
 424       {
 425         # Draws one data set by sampling individuals with replacement and
 426         # writes it to $new_name. With stratify_on set (a column number, or
 427         # a column head if it contains a non-digit) every stratum returned
 428         # by factors is resampled to its own size; otherwise $subjects
 429         # individuals are drawn from the whole data set. Results: $boot
 430         # (the new data object), @incl_individuals (the id numbers drawn)
 431         # and @included_keys (their indices in this object's individuals).
 432         #
 433         # If $resume is set and $new_name already exists, nothing is added
 434         # to the result arrays: the random numbers are still drawn, to
 435         # generate the same pseudo-random sequence, and $boot is initiated
 436         # from the existing file.
 437         $self -> synchronize;
 438         my @id_ids = defined $self -> {'individual_ids'} ? @{$self -> {'individual_ids'}} : ();
 439         # Fetched up front: the resume path needs the number of individuals
 440         # too (it used to dereference $individuals before it was assigned).
 441         my @header      = @{$self -> {'header'}};
 442         my $individuals = $self -> {'individuals'};
 443         my $resuming    = ( $resume and -e $new_name );
 444         my ( @bs_inds, @bs_id_ids );
 445
 446         # Each pool is a list of individual keys plus the number of draws
 447         # to make from it.
 448         my @pools;
 449         if ( defined $stratify_on ) {
 450           my %strata;
 451           if( $stratify_on =~ /\D/ ){
 452             %strata = %{$self -> factors( column_head => $stratify_on )};
 453             if ( $strata{'Non-unique values found'} eq '1' ) {
 454               debug -> die( message => "Individuals were found to have multiple values in the $stratify_on column. ".
 455                             "The column $stratify_on cannot be used for stratification of the resampling." );
 456             }
 457           } else {
 458             %strata = %{$self -> factors( column => $stratify_on )};
 459             if ( $strata{'Non-unique values found'} eq '1' ) {
 460               debug -> die( message => "Individuals were found to have multiple values in column number $stratify_on. ".
 461                             "Column $stratify_on cannot be used for stratification of the resampling." );
 462             }
 463           }
 464           foreach my $key_list ( values %strata ) {
 465             push( @pools, { 'keys' => $key_list, 'draws' => scalar @{$key_list} } );
 466           }
 467         } else {
 468           push( @pools, { 'keys' => [ 0 .. $#{$individuals} ], 'draws' => $subjects } );
 469         }
 470
 471         foreach my $pool ( @pools ) {
 472           my $key_list = $pool -> {'keys'};
 473           for ( my $i = 1; $i <= $pool -> {'draws'}; $i++ ) {
 474             my $pick = random_uniform_integer( 1, 0, $#{$key_list} );
 475             # When resuming, the draw only advances the random sequence.
 476             next if ( $resuming );
 477             my $key = $key_list -> [$pick];
 478             push( @bs_inds, $individuals -> [$key] -> copy );
 479             push( @included_keys, $key );
 480             push( @incl_individuals, $individuals -> [$key] -> idnumber );
 481             push( @bs_id_ids, $id_ids[$key] );
 482           }
 483         }
 484
 485         if ( $resuming ) {
 486           # Initiate the data object from the file that is already on disk.
 487           # The stratified case used to leave $boot undefined here.
 488           $boot = data -> new( idcolumn    => $self -> {'idcolumn'},
 489                                ignoresign  => $self -> {'ignoresign'},
 490                                filename    => $new_name,
 491                                ignore_missing_files => 1,
 492                                target      => $target );
 493           $boot -> _write;
 494           $boot -> flush;
 495         } else {
 496           # MUST FIX: If a file already exists with the same name,
 497           # the created bs data set will be appended to this. IT
 498           # MUST BE OVERWRITTEN!
 499           $boot = data -> new( header      => \@header,
 500                                idcolumn    => $self -> {'idcolumn'},
 501                                ignoresign  => $self -> {'ignoresign'},
 502                                individuals => \@bs_inds,
 503                                filename    => $new_name,
 504                                ignore_missing_files => 1,
 505                                target      => 'mem' );
 506           $boot -> renumber_ascending;
 507           $boot -> _write;
 508           # As before: a stratified sample is always flushed and keeps its
 509           # target, an unstratified one gets the requested target.
 510           if ( defined $stratify_on ) {
 511             $boot -> flush;
 512           } else {
 513             $boot -> target( $target );
 514           }
 515         }
 516         $boot -> flush if ( not defined $stratify_on and $target eq 'disk' );
 517
 518         $boot -> register_in_database( individual_ids => \@bs_id_ids,
 519                                        resampled      => 1,
 520                                        model_id       => $model_id );
 521       }
 522 end resample
 520
 521 # }}} resample
 522
 523 # {{{ case_deletion
 524
 525 start case_deletion
 526       {
 527         # case_deletion creates subsets of the data. The number of
 528         # subsets is specified by the bins argument (default: one per
 529         # factor of the case column). The individuals of each subset
 530         # are selected randomly or in ascending numerical order of the
 531         # factors depending on the selection argument that can be
 532         # either 'consecutive' or 'random'. case_column must be
 533         # specified to give the method something to base the selection
 534         # on. Valid case_column values are either the column number
 535         # (starting with a digit) or the name of the column in the
 536         # (optional) header row.
 537         #
 538         # For bin k the method writes cdd_<k+1>.dta (the data without the
 539         # bin's individuals) and rem_<k+1>.dta (the removed individuals)
 540         # in $directory, and fills @subsets, @remainders, @skipped_keys,
 541         # @skipped_ids and @skipped_values.
 542         $self -> synchronize;
 543         my @header = @{$self -> {'header'}};
 544         if ( not defined $case_column ) {
 545           debug -> die( message => "case_column must be specified" );
 546         } elsif ( not $case_column =~ /^\d/ ) {
 547           # Translate a header name to its one-based column number. Stop at
 548           # the first match so the number is not compared with later headers.
 549           for ( my $h = 0; $h <= $#header; $h++ ) {
 550             if ( $header[$h] eq $case_column ) {
 551               $case_column = $h+1;
 552               last;
 553             }
 554           }
 555         }
 556         # factors is called once; its result also gives the default bin count.
 557         my %factors = %{$self -> factors( column => $case_column )};
 558         $bins = scalar keys %factors unless ( defined $bins );
 559         if ( $factors{'Non-unique values found'} eq '1' ) {
 560           debug -> die( message => "Individuals were found to have multiple values in column number $case_column. ".
 561                         "Column $case_column cannot be used for case deletion." );
 562         }
 563
 564         my $maxbins     = scalar keys %factors;
 565         my @ftrs        = sort { $a <=> $b } keys %factors;
 566         my $individuals = $self -> {'individuals'};
 567         my $maxkey      = scalar @{$individuals} - 1;
 568
 569         my @binsize;
 570         my ( $k, $j, $i ) = ( 0, 0, 0 );
 571         # Create the binsizes: the factors are dealt out over the bins one at a time
 572         for ( $j = 0; $j < $maxbins; $j++ ) {
 573           $binsize[ $k++ ]++;
 574           $k = 0 if( $k >= $bins );
 575         }
 576         $self -> _fisher_yates_shuffle( array => \@ftrs ) if( $selection eq 'random' );
 577         # Assign consecutive factors (and the keys of their individuals) to each bin
 578         for ( $k = 0; $k < $bins; $k++ ) {
 579           for ( $j = 0; $j < $binsize[ $k ]; $j++ ) {
 580             push( @{$skipped_keys[ $k ]}, @{$factors{ $ftrs[ $i ] }} );
 581             push( @{$skipped_values[ $k ]}, $ftrs[ $i++ ] );
 582           }
 583         }
 584
 585         for ( $k = 0; $k < $bins; $k++ ) {
 586           my @cd_inds = ();
 587           my @del_inds = ();
 588           # Split the individuals into those removed in this bin and the rest
 589           SELKEYS: foreach my $key ( 0..$maxkey ) {
 590             foreach my $skipped ( @{$skipped_keys[ $k ]} ) {
 591               if ( $key == $skipped ) {
 592                 push( @{$skipped_ids[ $k ]}, $individuals ->
 593                       [ $skipped ] -> idnumber );
 594                 push( @del_inds, $individuals -> [ $key ] -> copy );
 595                 next SELKEYS;
 596               }
 597             }
 598             push( @cd_inds, $individuals -> [ $key ] -> copy );
 599           }
 600           # Set ignore_missing_files = 1 to make it possible to get the result
 601           # in memory only
 602           my $newdata = data ->
 603               new ( header      => \@header,
 604                     ignoresign  => $self -> {'ignoresign'},
 605                     idcolumn    => $self -> {'idcolumn'},
 606                     individuals => \@cd_inds,
 607                     target      => $target,
 608                     filename    => $directory.'/cdd_'.($k+1).'.dta',
 609                     ignore_missing_files => 1 );
 610           my $deldata = data ->
 611               new ( header      => \@header,
 612                     ignoresign  => $self -> {'ignoresign'},
 613                     idcolumn    => $self -> {'idcolumn'},
 614                     individuals => \@del_inds,
 615                     target      => $target,
 616                     filename    => $directory.'/rem_'.($k+1).'.dta',
 617                     ignore_missing_files => 1 );
 618           push( @subsets, $newdata );
 619           push( @remainders, $deldata );
 620           $newdata -> _write;
 621           $newdata -> flush;
 622           $deldata -> _write;
 623           $deldata -> flush;
 624         }
 625       }
 626 end case_deletion
 618
 619 # }}} case_deletion
 620
 621 # {{{ copy
 622 start copy
 623       {
 624         # Copies the data file to $directory.$filename and returns, in $new_data, a data object
 625         # for the copy. filename: new data file name. NOTE(review): 'target' is not referenced here.
 626         ($directory, $filename) = OSspecific::absolute_path( $directory, $filename );
 627         # cp returns false on failure; warn instead of silently carrying on.
 628         cp( $self -> full_name, $directory.$filename ) or debug -> warn( level => 1,
 629           message => "data -> copy: could not copy ".$self -> full_name." to ".$directory.$filename.": $!" );
 630
 631         # Clone self into new data object. To make the clone read its individuals
 632         # again, synced must be set to 0 AND {'individuals'} to undef.
 633         $new_data = Storable::dclone( $self );
 634         $new_data -> {'synced'} = 0;
 635         $new_data -> {'individuals'} = undef;
 636         $new_data -> synchronize;
 637         # Set the new file name for the copy
 638         $new_data -> directory( $directory );
 639         $new_data -> filename( $filename );
 640       }
 641 end copy
 643
 644 # }}} copy
 645
 646 # {{{ column_to_array
 647 start column_to_array
 648 {
 649   # Collects one column of the data set into @array, row by row over
 650   # all individuals. $column is used as a zero-based field index; a value
 651   # not starting with a digit is looked up in column_head_indices first.
 652   $self -> synchronize;
 653
 654   unless ( $column =~ /^\d/ ) {
 655     $column = $self -> {'column_head_indices'} -> {$column} - 1;
 656   }
 657
 658   # An index outside the header gives an empty result.
 659   return [] if ( $column < 0 or $column > $#{$self -> {'header'}} );
 660
 661   foreach my $subject ( @{$self -> individuals} ) {
 662     push( @array, map { my @fields = split( /,/, $_ ); $fields[$column] }
 663                   @{$subject -> subject_data} );
 664   }
 665 }
 666 end column_to_array
 667 # }}}
 668
 669 # {{{ count_ind
 670
 671 start count_ind
 672       {
 673         # Synchronizes the object, then reports how many individuals it holds.
 674         $self -> synchronize;
 675         $num = @{ $self -> {'individuals'} };
 676       }
 677 end count_ind
 678
 679 # }}} count_ind
 680
 681 # {{{ diff
 682 start diff
 683 {
 684   # Compares the selected columns of this data object with the same columns
 685   # of $against_data, individual by individual, through individual -> diff.
 686   # Columns are selected by one-based number (@columns) or, if no numbers
 687   # are given, by header name (@column_heads). With global_largest set,
 688   # %diff_results holds for each column (keyed by header name if given,
 689   # else by column number) the largest 'diff' found over all individuals
 690   # together with its 'self' and 'test' values. The other modes are
 691   # accepted but do not store anything in %diff_results.
 692   # Dies if there are no individuals, on an invalid column or column head,
 693   # if the two objects differ in number of individuals, or if no mode is set.
 694   $self -> synchronize;
 695
 696   my $first_id = $self -> {'individuals'}[0];
 697
 698   debug -> die( message => "No individuals defined in data object based on ".
 699                 $self -> full_name ) unless ( defined $first_id );
 700
 701   # Check if $column(-index) is defined and valid, else try to find index
 702   # using column_head
 703   my @data_row = split( /,/, $first_id -> subject_data -> [0] );
 704   if( $#columns >= 0 ) {
 705     foreach my $column ( @columns ) {
 706       # Column numbers are one-based: 0 must not wrap around to the last field.
 707       unless ( defined $column && $column >= 1 && defined( $data_row[$column-1] ) ) {
 708         debug -> die( message => "Error in data -> diff: ".
 709                       "invalid column number: \"$column\"\n".
 710                       "Valid column numbers are 1 to ".
 711                       scalar( @data_row )."\n" );
 712       }
 713     }
 714   } elsif ( $#column_heads >= 0 ) {
 715     foreach my $column_head ( @column_heads ) {
 716       unless (defined($column_head) && defined($self -> {'column_head_indices'}{$column_head})) {
 717         debug -> die( message => "Error in data -> diff: unknown column: \"$column_head\" ".
 718                       "Valid column headers are (in no particular order):\n".
 719                       join(', ',keys(%{$self -> {'column_head_indices'}})) );
 720       } else {
 721         my $column = $self -> {'column_head_indices'}{$column_head};
 722         push( @columns, $column );
 723         debug -> warn( level   => 2,
 724                        message => "$column_head is in column number $column" );
 725       }
 726     }
 727   } else {
 728     debug -> die( message => "No column or column_head defined" );
 729   }
 730
 731   if( $global_largest or $global_smallest or
 732       $largest_per_individual or $smallest_per_individual ) {
 733     if( scalar( @{$self -> {'individuals'}} ) != scalar( @{$against_data -> individuals} ) ) {
 734       debug -> die( message => "Both data object must hold the same number of individuals ".
 735                     "and observations when calling data -> diff" );
 736     }
 737     for( my $i = 0; $i < scalar @{$self -> {'individuals'}}; $i++ ) {
 738       my %id_diffs = %{$self -> {'individuals'}[$i] ->
 739                            diff( against_individual => $against_data -> individuals -> [$i],
 740                                  columns            => \@columns,
 741                                  absolute_diff      => $absolute_diff,
 742                                  diff_as_fraction   => $diff_as_fraction,
 743                                  largest            => ( $global_largest or $largest_per_individual ),
 744                                  smallest           => ( $global_smallest or $smallest_per_individual ) )};
 745       next unless ( $global_largest );
 746       # Keep, per column, the largest difference seen so far over the individuals
 747       for( my $j = 0; $j <= $#columns; $j++ ) {
 748         my $label = defined $column_heads[$j] ? $column_heads[$j] : $columns[$j];
 749         if( not defined $diff_results{$label} or not defined $diff_results{$label}{'diff'} or
 750             $id_diffs{$columns[$j]}{'diff'} > $diff_results{$label}{'diff'} ) {
 751           $diff_results{$label}{$_} = $id_diffs{$columns[$j]}{$_} foreach ( 'diff', 'self', 'test' );
 752         }
 753       }
 754     }
 755   } else {
 756     die "data -> diff is only implemented for finding the largest difference at any observation at this point\n";
 757   }
 758 }
 759 end diff
 752 # }}} diff
 753
 754 # {{{ filename
 755 start filename
 756       {
 757         # Store a new, different file name and reset data_id to undef.
 758         my $renamed = ( defined $parm and $parm ne $self -> {'filename'} );
 759         if ( $renamed ) {
 760           @{$self}{'filename', 'data_id'} = ( $parm, undef );
 761         }
 762       }
 763 end filename
 764 # }}} filename
 765
 766 # {{{ fractions
 767
 768 start fractions
 769       {
 770         # Fills %fractions with each factor's share of the occurences in
 771         # the given column. With ignore_missing set the missing data token
 772         # is left out of both the total and the result.
 773         my %factors = $self -> factors( 'return_occurences' => 1,
 774                                         'unique_in_individual' => $unique_in_individual,
 775                                         'column_head' => $column_head,
 776                                         'column' => $column);
 777
 778         # The token lives under 'missing_data_token', the key read by the
 779         # other statistics methods of this class.
 780         my $missing = $self -> {'missing_data_token'};
 781         my ( $sum, @kept ) = ( 0 );
 782         foreach my $factor ( keys %factors ) {
 783           next if ( $ignore_missing && $factor == $missing );
 784           push( @kept, $factor );
 785           $sum += $factors{$factor};
 786         }
 787         foreach my $factor ( @kept ) {
 788           $fractions{$factor} = $factors{$factor}/$sum;
 789         }
 790       }
 791 end fractions
 792
 793 # }}} fractions
 794
 795 # {{{ factors
 796
 797 start factors
 798       {
 799         # Collects the distinct values (factors) found in one column.
 800         # Either column (number, starting at 1) or column_head must be
 801         # specified.
 802         #
 803         # By default %factors maps each factor to a reference to an array
 804         # holding the order numbers (not the ID numbers) of the individuals
 805         # that contain it.
 806         #
 807         # With return_occurences set, %factors instead maps each factor to
 808         # the number of individuals it occurs in; several occurences within
 809         # one individual count as one.
 810         #
 811         # With unique_in_individual set, %factors is replaced by the single
 812         # pair 'Non-unique values found' => 1 as soon as an individual holds
 813         # more than one value in the column.
 814
 815         $self -> synchronize;
 816
 817         my $first_individual = $self -> {'individuals'}[0];
 818         unless ( defined $first_individual ) {
 819           debug -> die( message => "No individuals defined in data object based on ".
 820                         $self -> full_name );
 821         }
 822
 823         # Use the column number when it addresses a field in the first
 824         # data row; otherwise look the number up through column_head.
 825         my @first_row = split( /,/, $first_individual -> subject_data -> [0] );
 826         my $column_ok = ( defined $column && defined( $first_row[$column-1] ) );
 827         if ( not $column_ok ) {
 828           my $head_ok = ( defined($column_head) &&
 829                           defined($self -> {'column_head_indices'}{$column_head}) );
 830           if ( $head_ok ) {
 831             $column = $self -> {'column_head_indices'}{$column_head};
 832             debug -> warn( level   => 2,
 833                            message => "$column_head is in column number $column" );
 834           } else {
 835             debug -> die( message => "Error in data -> factors: unknown column: \"$column_head\" ".
 836                           "or invalid column number: \"$column\".\n".
 837                           "Valid column numbers are 1 to ".scalar @first_row ."\n".
 838                           "Valid column headers are (in no particular order):\n".
 839                           join(', ',keys(%{$self -> {'column_head_indices'}})) );
 840           }
 841         }
 842
 843         my $order_number = 0;
 844         foreach my $individual ( @{$self -> {'individuals'}} ) {
 845           my @ifactors = keys %{$individual -> factors( column => $column )};
 846           if ( $unique_in_individual and scalar @ifactors > 1 ) {
 847             %factors = ( 'Non-unique values found' => 1 );
 848             last;
 849           }
 850           if ( scalar @ifactors == 0 ) {
 851             debug -> die( message => "No value found in column $column in individual ".
 852                           $individual -> idnumber );
 853           }
 854
 855           foreach my $ifactor ( @ifactors ) {
 856             if ( $return_occurences ) {
 857               $factors{$ifactor}++;
 858             } else {
 859               push( @{$factors{$ifactor}}, $order_number );
 860             }
 861           }
 862           $order_number++;
 863         }
 864       }
 865 end factors
 866
 867 # }}} factors
 868
 869 # {{{ find_individual
 870
 871 # start find_individual
 872 #         foreach my $tmp_ind ( @{$self -> individuals} ) {
 873 #         if ( $tmp_ind -> key == $key ) {
 874 #           $individual = $tmp_ind;
 875 #           last;
 876 #         }
 877 #       }
 878 #         if ( defined $individual ) {
 879 #         if ( $copy ) {
 880 #           $individual = $individual -> copy;
 881 #         }
 882 #       } else {
 883 #         print "No individual with key $key found in call to ".
 884 #           "data -> find_individual\n" if ( $self -> debug );
 885 #         }
 886 # end find_individual
 887
 888 # }}}
 889
 890 # {{{ format_data
 891
 892 start format_data
 893       {
 894         # Builds @form_data: the (optional) header followed by every data
 895         # row of every individual, each entry terminated by a newline. When
 896         # both wrap_column and cont_column are defined each row is written
 897         # as one line per secondary column set (CONT=1) followed by the
 898         # line with the primary columns (CONT=0).
 899         #
 900         # NOTE(review): the comment lines in $self -> {'comment'} are not
 901         # written; the loop that used to be here pushed nothing onto
 902         # @form_data. Confirm that leaving them out is intended.
 903         my $header = $self -> {'header'};
 904         my $wrap = ( defined $self -> {'wrap_column'} and
 905                      defined $self -> {'cont_column'} );
 906
 907         my @primary_columns = defined $self -> {'primary_columns'} ?
 908           @{$self -> {'primary_columns'}} : ();
 909         my @secondary_columns = defined $self -> {'secondary_columns'} ?
 910           @{$self -> {'secondary_columns'}} : ();
 911         if ( defined $header and defined $self -> {'ignoresign'} ) {
 912           # The '@' ignore sign is not written in front of the header;
 913           # start from '' so that no undefined value is concatenated.
 914           my $istr = $self -> {'ignoresign'} ne '@' ? $self -> {'ignoresign'} : '';
 915           if ( $wrap ) {
 916             foreach my $column_set ( @secondary_columns ) {
 917               push( @form_data,
 918                     $istr.join( ',', map { $_ -> [0] } @{$column_set} )."\n" );
 919             }
 920             push( @form_data,
 921                   $istr.join( ',', map { $_ -> [0] } @primary_columns )."\n" );
 922           } else {
 923             push( @form_data, $istr.join(',',@{$self -> {'header'}})."\n" );
 924           }
 925         }
 926         foreach my $individual ( @{$self -> {'individuals'}} ) {
 927           foreach my $row ( @{$individual -> subject_data} ) {
 928             unless ( $wrap ) {
 929               push( @form_data, $row ."\n" );
 930               next;
 931             }
 932             # Split each row once; every column set reads from the same
 933             # fields.
 934             my @data_row = split( /,/, $row );
 935             foreach my $column_set ( @secondary_columns ) {
 936               my @fields = map { $_ -> [0] eq 'CONT' ? '1' : $data_row[$_ -> [1]] }
 937                            @{$column_set};
 938               push( @form_data, join( ',', @fields )."\n" );
 939             }
 940             my @fields = map { $_ -> [0] eq 'CONT' ? '0' : $data_row[$_ -> [1]] }
 941                          @primary_columns;
 942             push( @form_data, join( ',', @fields )."\n" );
 943           }
 944         }
 945       }
 946 end format_data
 977
 978 # }}} format_data
 979
 980 # {{{ drop_dropped
 981
 982 start drop_dropped
 983       {
 984         # Builds one drop flag per column of the model header (as given by
 985         # $INPUT), hands the flags to drop_columns of every individual and
 986         # rebuilds $self -> {'header'} from the names of the kept columns.
 987         # A column is flagged when its value is DROP or SKIP, unless its
 988         # name matches DAT(E|1|2|3).
 989         #
 990         # model_header is a two-dimensional array: each element refers to
 991         # a 1*2 array holding the column name and value. Any ignore-sign
 992         # must already be removed.
 993         #
 994         # Important that the drop_dropped method of the model::problem
 995         # class is in sync with this method.
 996         if ( $#model_header < 0 ) {
 997           debug -> die( message => 'model header must be defined' );
 998         }
 999         $self -> synchronize;
1000
1001         my ( @drop, @kept_names );
1002         my $position = 1;   # number of the next kept column, starting at 1
1003         foreach my $entry ( @model_header ) {
1004           my $name  = $entry -> [0];
1005           my $value = $entry -> [1];
1006           $self -> {'idcolumn'} = $position if ( $name eq 'ID' );
1007           my $marked = ( $value eq 'DROP' or $value eq 'SKIP' );
1008           if ( $marked and not $name =~ /DAT(E|1|2|3)/ ) {
1009             push( @drop, 1 );
1010             next;
1011           }
1012           push( @drop, 0 );
1013           push( @kept_names, $name );
1014           $position++;
1015         }
1016         $self -> {'header'} = \@kept_names;
1017
1018         $_ -> drop_columns( drop => \@drop )
1019           foreach ( @{$self -> {'individuals'}} );
1020
1021         $self -> {'synced'} = 0;
1022       }
1023 end drop_dropped
1024
1025 # }}} drop_dropped
1026
1027 # {{{ wrap
1028 start wrap
1029       {
1030         # Store the columns, run prepare_wrap and copy out the column layouts.
1031         $self -> synchronize;
1032         defined $cont_column and $self -> cont_column( $cont_column );
1033         defined $wrap_column and $self -> wrap_column( $wrap_column );
1034         $self -> prepare_wrap( model_header => \@model_header );
1035         my ( $secondary, $primary ) = @{$self}{'secondary_columns', 'primary_columns'};
1036         @secondary_columns = @{$secondary} if ( defined $secondary );
1037         @primary_columns   = @{$primary}   if ( defined $primary );
1038       }
1039 end wrap
1040 # }}} wrap
1041
1042 # {{{ unwrap
1043 start unwrap
1044       {
1045         # Clear the wrap settings and the column layouts built from them.
1046         my @wrap_keys = qw( cont_column wrap_column secondary_columns primary_columns );
1047         $self -> {$_} = undef foreach ( @wrap_keys );
1048       }
1049 end unwrap
1051 # }}} unwrap
1052
1053 # {{{ prepare_wrap
1054
1055 start prepare_wrap
1056       {
             # Builds the column layouts used when the data is written in
             # wrapped form, i.e. with each data row spread over several
             # lines of wrap_column columns:
             #   $self -> {'primary_columns'}   gets the entries of the line
             #     holding the primary columns (see @primary_column_names),
             #   $self -> {'secondary_columns'} gets one array reference per
             #     additional line holding the remaining columns.
             # Every line starts with ID and holds CONT at position
             # cont_column. An entry is [name, index in the header(, value)];
             # CONT has index undef and the 'XX<n>' fillers point at the ID
             # column.
             #
             # The header is taken from the model_header argument when it is
             # non-empty (elements may then be [name, value] pairs) and from
             # $self -> {'header'} otherwise.
             #
             # NOTE(review): primary_columns and secondary_columns are added
             # to, not reset, here - presumably unwrap is expected to run
             # between two calls; confirm.
1057         my $cont_column = $self -> {'cont_column'};
1058         my $wrap_column = $self -> {'wrap_column'};
1059         debug -> die( message => 'cont_column ('.$cont_column.') must be less or equal '.
1060                       'to the requested number of columns in each row ('.
1061                       ($wrap_column).')' )
1062           if ( $cont_column > $wrap_column );
1063         my @header;
1064         if ( scalar @model_header > 0 ) {
1065           @header = @model_header;
1066         } else {
1067           @header = @{$self -> {'header'}};
1068         }
1069
1070         my ( @primary, @secondary, @date_columns );
1071
             # Sort the header columns (ID excepted) into primary and
             # secondary ones. A column is primary when its name or its value
             # is one of @primary_column_names.
1072         for ( my $i = 0; $i <= $#header; $i++ ) {
1073           my $name  = ref( $header[$i] ) eq 'ARRAY' ? $header[$i][0] : $header[$i];
1074           my $value = ref( $header[$i] ) eq 'ARRAY' ? $header[$i][1] : undef;
1075           next if ( $name eq 'ID' );
1076           my $found = 0;
1077           foreach my $prim ( @primary_column_names ) {
1078             if ( not $found and
1079                  ( $name eq $prim or $value eq $prim ) ) {
1080               push( @primary, [$name, $i, $value] );
1081               $found = 1;
                   # 1-based position of this column on the primary line,
                   # counting ID and, when it comes first, CONT. The
                   # positions of date columns are remembered for the
                   # secondary lines below.
1082               my $col = ($#primary+2)>= $cont_column ? ($#primary+3) : ($#primary+2);
1083               push( @date_columns, $col ) if ( $name =~ /DAT(E|1|2|3)/ );
1084             }
1085           }
1086           push( @secondary, [$name, $i, $value] ) if ( not $found );
1087         }
1088
             # All primary columns must fit on one line.
             # NOTE(review): the message reports $prim_num+1 columns while
             # the test compares $prim_num with wrap_column-2 - presumably ID
             # and CONT account for the difference; confirm.
1089         my $prim_num = scalar @primary;
1090         debug -> die( message => 'The number of primary columns (that need to '.
1091                       'be part of the row with CONT=0) ('.($prim_num+1).
1092                       ') is larger than the required number of columns (wrap_column='.
1093                       $wrap_column.') - 1' )
1094           if ( scalar $prim_num > ($wrap_column-2) );
1095
             # Lay out the primary line position by position. $i indexes
             # @primary and $dum numbers the 'XX' fillers. Positions left
             # over after the primary columns are filled with secondary
             # columns (taken off the front of @secondary) and then fillers.
1096         my ( $i, $dum ) = ( 0, 1 );
1097         my @tmp;
1098         for ( my $j = 1; $j <= $wrap_column; $j++ ) {
1099           if( $j == 1 ) {
1100             push( @tmp, ['ID', $self -> {'idcolumn'}-1] );
1101           } elsif ( $j == $wrap_column ) {
                 # Last position: complete the line and store its entries.
1102             if ( $j == $cont_column ) {
1103               push( @tmp, ['CONT', undef] );
1104             } else {
1105               my $val;
1106               if ( defined $primary[$i] ) {
1107                 $val = $primary[$i];
1108               } elsif ( defined $secondary[0] ) {
1109                 $val = shift(@secondary);
1110               } else {
1111                 $val = ['XX'.$dum++,$self -> {'idcolumn'}-1];
1112               }
1113               push( @tmp, $val );
1114               $i++;
1115             }
1116             push( @{$self -> {'primary_columns'}}, @tmp );
1117           } else {
1118             if ( $j == $cont_column ) {
1119               push( @tmp, ['CONT', undef] );
1120             } else {
1121               if ( $i <= $#primary ) {
1122                 push( @tmp, $primary[$i] );
1123                 $i++;
1124               } else {
1125                 my $val = defined $secondary[0] ? shift(@secondary) :
1126                   ['XX'.$dum++,$self -> {'idcolumn'}-1];
1127                 push( @tmp, $val );
1128               }
1129             }
1130           }
1131         }
1132
             # Lay out further lines until every remaining secondary column
             # is placed. Each finished line is put first in
             # secondary_columns (unshift).
             # NOTE(review): this second 'my $i' masks the one declared
             # above; from here on $i indexes @secondary.
1133         my $i = 0;
1134         while ( $i <= $#secondary ) {
1135           my @tmp;
1136           for ( my $j = 1; $j <= $wrap_column; $j++ ) {
1137             if( $j == 1 ) {
1138               push( @tmp, ['ID', $self -> {'idcolumn'}-1] );
1139             } elsif ( $j == $wrap_column ) {
1140               if ( $j == $cont_column ) {
1141                 push( @tmp, ['CONT', undef] );
1142               } else {
1143                 my $val = defined $secondary[$i] ? $secondary[$i] :
1144                   ['XX'.$dum++,$self -> {'idcolumn'}-1];
1145                 push( @tmp, $val );
1146                 $i++;
1147               }
1148               unshift( @{$self -> {'secondary_columns'}}, \@tmp );
1149             } else {
1150               if ( $j == $cont_column ) {
1151                 push( @tmp, ['CONT', undef] );
1152               } else {
1153                 my $isdate = 0;
1154                 if ( $#date_columns >= 0 ) {
1155                   foreach my $col ( @date_columns ) {
1156                     # This is a date column which may have to be dropped
1157                     # and thus will not appear as a secondary
1158                     # column. Nothing should be pushed. The indexes in
1159                     # model::problem::pk::_format_record will be ok.
1160                     $isdate = 1 if ( $col == $j ) ;
1161                   }
1162                 }
                     # A position that holds a date column on the primary
                     # line gets a filler on the secondary lines.
1163                 if ( $isdate ) {
1164                   push( @tmp, ['XX'.$dum++,$self -> {'idcolumn'}-1] );
1165                 } else {
1166                   if ( $i <= $#secondary ) {
1167                     push( @tmp, $secondary[$i] );
1168                     $i++;
1169                   } else {
1170                     push( @tmp, ['XX'.$dum++,$self -> {'idcolumn'}-1] );
1171                   }
1172                 }
1173               }
1174             }
1175           }
1176         }
1177       }
1178 end prepare_wrap
1179
1180 # }}} prepare_wrap
1181
1182 # {{{ have_missing_data
1183 start have_missing_data
1184       {
1185         # Either I<column> or I<column_head> must be specified.
1186         #
1187         # Sets $return_value to the entry stored for the column in
1188         # $self -> {'have_missing_data'}, or to 0 when that attribute is
1189         # undefined.
1190         $self -> synchronize;
1191         my $first_individual = $self -> {'individuals'}[0];
1192         unless ( defined $first_individual ) {
1193           debug -> die( message => "No individuals defined in data object based on ".
1194                         $self -> full_name );
1195         }
1196         my @first_row = split( /,/ , $first_individual -> subject_data -> [0] );
1197         if ( not defined $column or not defined( $first_row[$column-1] ) ) {
1198           if ( defined($column_head) && defined($self -> {'column_head_indices'}{$column_head}) ) {
1199             $column = $self -> {'column_head_indices'}{$column_head};
1200           } else {
1201             die "Error in data -> have_missing_data: unknown column: \"$column_head\" or invalid column number: \"$column\"\n";
1202           }
1203         }
1204         $self -> flush if ( $self -> {'target'} eq 'disk' );
1205         if ( defined $self -> {'have_missing_data'} ) {
1206           $return_value = $self -> {'have_missing_data'} -> {$column};
1207         } else {
1208           $return_value = 0;
1209         }
1210       }
1211 end have_missing_data
1212 # }}} have_missing_data
1213
1214 # {{{ merge
1215 start merge
1216       {
1217         my $incoming = $mergeobj -> individuals;   # appended to our own list
1218         push( @{$self -> {'individuals'}}, @{$incoming} );
1219       }
1220 end merge
1221 # }}} merge
1222
1223 # {{{ max
1224
1225 start max
1226       {
1227         # Either column or column_head must be specified. Column_head must be a
1228         # string that identifies a column in the (optional) data file header.
1229         #
1230         # Sets $return_value to the numerically largest factor found in the
1231         # column, skipping the missing data token. The maximum is recalculated
1232         # on every call: a cache of earlier results was dropped since
1233         # individuals can be accessed in too many ways to keep it up to date.
1234         $self -> synchronize;
1235         my $first_individual = $self -> {'individuals'}[0];
1236         unless ( defined $first_individual ) {
1237           debug -> die( message => "data -> max: No individuals defined in data object based on " .
1238                         $self -> full_name );
1239         }
1240
1241         my @first_row = split( /,/ , $first_individual -> subject_data ->[0] );
1242
1243         if ( not defined $column or not defined( $first_row[$column-1] ) ) {
1244           if ( defined($column_head) && defined($self -> {'column_head_indices'}{$column_head}) ) {
1245             $column = $self -> {'column_head_indices'}{$column_head};
1246           } else {
1247             die "Error in data -> max: unknown column: \"$column_head\" or invalid column number: \"$column\"\n";
1248           }
1249         }
1250
1251         my $missing = $self -> {'missing_data_token'};
1252         foreach my $individual ( @{$self -> {'individuals'}} ) {
1253           foreach my $value ( keys %{$individual -> factors( 'column' => $column )} ) {
1254             next if ( $value == $missing );
1255             if ( not defined $return_value or $value > $return_value ) {
1256               $return_value = $value;
1257             }
1258           }
1259         }
1260
1261         $self -> flush if ( $self -> {'target'} eq 'disk' );
1262       }
1263 end max
1270
1271 # }}} max
1272
1273 # {{{ min
1274
1275 start min
1276       {
1277         # See L</max>.
1278         #
1279         # Sets $return_value to the numerically smallest factor found in the
1280         # column, skipping the missing data token. As for max, the value is
1281         # recalculated on every call instead of being cached.
1282
1283         # NOTE(review): $tmp_column is never read; the lookup is kept in
1284         # place, ahead of synchronize, so that nothing observable changes.
1285         my $tmp_column = $self -> {'column_head_indices'}{$column_head};
1286
1287         $self -> synchronize;
1288         my $first_individual = $self -> {'individuals'}[0];
1289         unless ( defined $first_individual ) {
1290           die "data -> min: No individuals defined in data object based on ",
1291             $self -> full_name,"\n";
1292         }
1293
1294         my @first_row = split( /,/ , $first_individual -> subject_data ->[0] );
1295
1296         if ( not defined $column or not defined( $first_row[$column-1] ) ) {
1297           if ( defined($column_head) && defined($self -> {'column_head_indices'}{$column_head}) ) {
1298             $column = $self -> {'column_head_indices'}{$column_head};
1299           } else {
1300             die "Error in data -> min: unknown column: \"$column_head\" or invalid column number: \"$column\"\n";
1301           }
1302         }
1303
1304         my $missing = $self -> {'missing_data_token'};
1305         foreach my $individual ( @{$self -> {'individuals'}} ) {
1306           foreach my $value ( keys %{$individual -> factors( 'column' => $column )} ) {
1307             next if ( $value == $missing );
1308             if ( not defined $return_value or $value < $return_value ) {
1309               $return_value = $value;
1310             }
1311           }
1312         }
1313
1314         $self -> flush if ( $self -> {'target'} eq 'disk' );
1315       }
1316 end min
1318
1319 # }}} min
1320
1321 # {{{ median
1322
1323 start median
1324       {
1325         # See L</max>.
1326         #
1327         # Sets $return_value to the median of the column and stores it in
1328         # $self -> {'median'}[$column]; a value already stored there is
1329         # returned at once. With unique_in_individual set each individual
1330         # contributes its factors, otherwise every data row contributes its
1331         # value. The missing data token is skipped in both cases.
1332         #
1333         # NOTE(review): the stored value is looked up by column only, so it
1334         # is reused regardless of unique_in_individual - confirm intended.
1335         $self -> synchronize;
1336         my $first_individual = $self -> {'individuals'}[0];
1337         unless ( defined $first_individual ) {
1338           die "data -> median: No individuals defined in data object based on ",
1339             $self -> full_name,"\n";
1340         }
1341
1342         my @first_row = split( /,/ , $first_individual -> subject_data ->[0] );
1343         if ( not defined $column or not defined( $first_row[$column-1] ) ) {
1344           if ( defined($column_head) && defined($self -> {'column_head_indices'}{$column_head}) ) {
1345             $column = $self -> {'column_head_indices'}{$column_head};
1346           } else {
1347             die "Error in data -> median: unknown column: \"$column_head\" or invalid column number: \"$column\"\n";
1348           }
1349         }
1350
1351         return $self -> {'median'}[$column] if ( defined $self -> {'median'}[$column] );
1352
1353         my $missing = $self -> {'missing_data_token'};
1354         my @values;
1355         foreach my $individual ( @{$self -> {'individuals'}} ) {
1356           my @candidates;
1357           if ( $unique_in_individual ) {
1358             @candidates = keys %{$individual -> factors( 'column' => $column )};
1359           } else {
1360             @candidates = map { my @fields = split( /,/ , $_ ); $fields[$column-1] }
1361                           @{$individual -> subject_data};
1362           }
1363           push( @values, grep { not $_ == $missing } @candidates );
1364         }
1365         @values = sort {$a <=> $b} @values ;
1366
1367         my $count = scalar @values;
1368         if ( $count % 2 ) {
1369           $return_value = $values[$#values / 2];
1370         } else {
1371           $return_value = ( $values[$count / 2] + $values[($count - 2) / 2] ) / 2;
1372         }
1373
1374         $self -> {'median'}[$column] = $return_value;
1375       }
1376 end median
1377
1378 # }}} median
1379
1380 # {{{ mean
1381
1382 start mean
1383   {
1384      # Sets $return_value to the mean of a column. See L</max> for how
1385      # the column is chosen.
1386      #
1387      # By default the values of each individual are averaged first and
1388      # the mean of those individual means is returned; an individual
1389      # without usable rows adds nothing to the sum but is still counted.
1390      # With global_mean set all rows are pooled instead.
1391      #
1392      # Rows holding the missing data token are skipped, as are rows for
1393      # which the subset expression is false when subset_column is given.
1394      #
1395      # With hi_cutoff defined a row contributes its excess over the
1396      # cutoff (nothing when it is not above it); otherwise, with
1397      # low_cutoff defined, a row contributes its shortfall below the
1398      # cutoff. This gives the HI-mean/LOW-mean used for e.g.
1399      # hockey-stick covariates. If both are defined only hi_cutoff is used.
1400
1401      # NOTE(review): $tmp_column is never read; the lookup is kept in
1402      # place, ahead of synchronize, so that nothing observable changes.
1403      my $tmp_column = $self -> {'column_head_indices'}{$column_head};
1404      $self -> synchronize;
1405      my $first_individual = $self -> {'individuals'}[0];
1406      unless ( defined $first_individual ) {
1407        die "data -> mean: No individuals defined in data object based on ",
1408          $self -> full_name,"\n";
1409      }
1410
1411      my @first_row = split( /,/, $first_individual -> subject_data ->[0] );
1412      if ( not defined $column or not defined( $first_row[$column-1] ) ) {
1413        if ( defined($column_head) && defined($self -> {'column_head_indices'}{$column_head}) ) {
1414          $column = $self -> {'column_head_indices'}{$column_head};
1415        } else {
1416          die "Error in data -> mean: unknown column: \"$column_head\" or invalid column number: \"$column\"\n";
1417        }
1418      }
1419
1420      my ( $sum, $num_individuals ) = ( 0, 0 );
1421      foreach my $individual ( @{$self ->{'individuals'}} ) {
1422        my ( $individual_sum, $data_rows ) = ( 0, 0 );
1423        foreach my $row ( @{$individual -> subject_data} ) {
1424          # Data is stored in strings that have to be split into fields.
1425          my @data_row = split( /,/, $row );
1426          my $value    = $data_row[$column-1];
1427          next if ( $value == $self -> {'missing_data_token'} );
1428
1429          # NOTE(review): the subset expression is built from a data field
1430          # and subset_syntax and run through string eval - both must be
1431          # trusted.
1432          next if ( defined $subset_column and
1433                    not eval ( $data_row[$subset_column-1].$subset_syntax ) );
1434
1435          if ( defined $hi_cutoff ) {
1436            $individual_sum += $value-$hi_cutoff if ( $value > $hi_cutoff );
1437          } elsif ( defined $low_cutoff ) {
1438            $individual_sum += $low_cutoff - $value if ( $value < $low_cutoff );
1439          } else {
1440            $individual_sum += $value;
1441          }
1442          $data_rows++;
1443        }
1444        if ( $global_mean ) {
1445          $sum             += $individual_sum;
1446          $num_individuals += $data_rows;
1447        } else {
1448          $sum += $individual_sum/$data_rows if ( $data_rows != 0 );
1449          $num_individuals++;
1450        }
1451      }
1452      $return_value = $sum / $num_individuals if ( $num_individuals != 0 );
1453   }
1454
1455 end mean
1471
1472 # }}} mean
1473
1474 # {{{ sd
1475
start sd
  {
    # Returns the standard deviation of one column of the data set.
    #
    # Variables read here are not declared in this section; they are
    # expected to be supplied by the code generated around it:
    #   $column / $column_head  1-based column index, or a header name
    #                           that is used when the index is undefined
    #                           or does not exist in the first data row.
    #   $hi_cutoff, $low_cutoff optional hockey-stick cut-offs. With
    #                           hi_cutoff a row counts as (value - cutoff)
    #                           when above the cut-off and as 0 otherwise;
    #                           with low_cutoff as (cutoff - value) when
    #                           below it and as 0 otherwise. hi_cutoff
    #                           takes precedence if both are defined.
    #   $subset_column,
    #   $subset_syntax          optional row filter: a row is used only if
    #                           eval( <value of subset column> . syntax )
    #                           is true.
    #   $global_sd              true: every data row is one observation.
    #                           false: every individual contributes the
    #                           mean of its rows as one observation.
    #
    # The result is stored in $return_value; it is 0 when fewer than two
    # observations are available. Rows whose value equals the missing data
    # token are skipped.
    # See L</max>.
     $self -> synchronize;
     my $first_id = $self -> {'individuals'}[0];
     debug -> die( message => "No individuals defined in data object based on ".
                   $self -> full_name ) unless defined $first_id;

     my @first_row = split( /,/ , $first_id -> subject_data ->[0] );

     # Fall back on the column header when no usable column index is given.
     unless ( defined $column  && defined( $first_row[$column-1] ) ) {
       if ( defined($column_head) && defined($self -> {'column_head_indices'}{$column_head}) ) {
         $column = $self -> {'column_head_indices'}{$column_head};
       } else {
         debug -> die( message => "Unknown column: \"$column_head\" or "
                       ."invalid column number: \"$column\"" );
       }
     }

     ## Here the calculation starts

     # The mean must be taken over exactly the rows that the loop below
     # uses, so the subset filter is always forwarded, also when a cut-off
     # is active.
     my %mean_args = ( column        => $column,
                       subset_column => $subset_column,
                       subset_syntax => $subset_syntax,
                       global_mean   => $global_sd );
     if ( defined $hi_cutoff ) {
       $mean_args{'hi_cutoff'} = $hi_cutoff;
     } elsif ( defined $low_cutoff ) {
       $mean_args{'low_cutoff'} = $low_cutoff;
     }
     my $mean = $self -> mean( %mean_args );

     my $num_individuals = 0;
     my $sum = 0;
     foreach my $individual ( @{$self -> {'individuals'}} ) {
       my $ifactors = $individual -> subject_data;
       my $individual_sum = 0;
       my $data_rows = 0;
       for ( my $i = 0; $i <= $#{$ifactors}; $i++ ) {

         # data is stored in strings. We need to split them into an
         # array.
         my @data_row = split( /,/, $ifactors -> [$i] );

         # Skip rows with missing data.
         next if ( $data_row[$column-1] == $self -> {'missing_data_token'} );

         # Skip rows outside the requested subset.
         # NOTE(review): this is a string eval of caller supplied text;
         # subset_syntax must never come from an untrusted source.
         next if ( defined $subset_column and
                   not eval ( $data_row[$subset_column-1].$subset_syntax ) );

         # Hockey-stick transformation of the row value. A row on the
         # flat side of the stick has the value 0 and must still take
         # part in the deviation sum below.
         my $value = $data_row[$column-1];
         if ( defined $hi_cutoff ) {
           $value = $value > $hi_cutoff ? $value - $hi_cutoff : 0;
         } elsif ( defined $low_cutoff ) {
           $value = $value < $low_cutoff ? $low_cutoff - $value : 0;
         }

         # Global mode accumulates squared deviations row by row; the
         # per-individual mode first needs the plain sum of the rows.
         $individual_sum += $global_sd ? ( $value - $mean ) ** 2 : $value;
         $data_rows++;
       }
       if( $global_sd ) {
         $sum += $individual_sum;
         $num_individuals += $data_rows;
       } else {
         if( $data_rows != 0 ) {
           $sum += ($individual_sum/$data_rows - $mean) ** 2;
         }
         # Individuals without usable rows are still counted, which is
         # the same convention as the one used by mean.
         $num_individuals++;
       }
     }

     # Sample standard deviation (n-1 in the denominator).
     if( $num_individuals < 2 ) {
       $return_value = 0;
     } else {
       $return_value = (1/($num_individuals-1)*$sum) ** 0.5;
     }
   }

end sd
1590
1591 # }}} sd
1592
1593 # {{{ range
start range
      {
        # Returns max - min of a column and caches the result in
        # $self -> {'range'}. The column is given either as a 1-based
        # index ($column) or as a header name ($column_head).
        # See L</max>.

        # Use one and the same cache slot for lookup and storage: the
        # column index if given, else the index of the column header.
        my $cache_index = $column;
        if ( not defined $cache_index and defined $column_head
             and defined $self -> {'column_head_indices'} ) {
          $cache_index = $self -> {'column_head_indices'}{$column_head};
        }

        if ( defined $cache_index and defined $self -> {'range'}[$cache_index] ) {
          $return_value = $self -> {'range'}[$cache_index];
        } else {
          my $old_target = $self -> {'target'};
          $self -> {'target'} = 'mem';
          $self -> synchronize;
          $return_value = $self -> max( column      => $column,
                                        column_head => $column_head ) -
                                          $self -> min( column      => $column,
                                                        column_head => $column_head );

          # The header indices are (re)built by synchronize, so a header
          # name that could not be resolved above may be resolvable now.
          if ( not defined $cache_index and defined $column_head ) {
            $cache_index = $self -> {'column_head_indices'}{$column_head};
          }
          # Never cache under an undefined index, that would silently
          # use slot 0 for every unresolved column.
          $self -> {'range'}[$cache_index] = $return_value
            if ( defined $cache_index );

          if ( $old_target eq 'disk' ) {
            # NOTE(review): the target was set to 'mem' above, so unless
            # synchronize/max/min change it this flush never runs.
            # Kept as is; confirm whether a flush is intended here.
            $self -> flush if ( $self -> {'target'} eq 'disk' );
            $self -> {'target'} = 'disk';
          }
        }
      }
end range
1616 # }}} range
1617
1618 # {{{ recalc_column
start recalc_column
      {
        # Recalculates a column based on expression by calling
        # recalc_column on every individual. The column is given either
        # as a 1-based index ($column) or as a header name
        # ($column_head). Also, see L</max>.
        #
        # NOTE(review): the 'synced' flag is not reset here although
        # the individuals are handed an expression to change their
        # data; verify whether that is intended.
        $self -> synchronize;

        # Check if $column(-index) is defined and valid, else try to find index using column_head
        my $first_id = $self -> {'individuals'}[0];
        die "data -> recalc_column: No individuals defined in data object based on ",
          $self -> full_name,"\n" unless defined $first_id;

        my @data_row = split( /,/ , $first_id -> subject_data ->[0] );

        unless ( defined $column  && defined( $data_row[$column-1] ) ) {
          # Only a known column header can replace the column index;
          # anything else is an error.
          if ( defined($column_head) && defined($self -> {'column_head_indices'}{$column_head}) ) {
            $column = $self -> {'column_head_indices'}{$column_head};
          } else {
            die "Error in data -> recalc_column: unknown column: \"$column_head\" or column number: \"$column\"\n";
          }
        }

        for my $individual ( @{$self -> {'individuals'}} ) {
          $individual -> recalc_column( column     => $column,
                                        expression => $expression );
        }
      }
end recalc_column
1645 # }}} recalc_column
1646
1647 # {{{ renumber_ascending
1648
start renumber_ascending
      {
        # Gives every individual a new subject identifier: the first one
        # gets start_at, the next one start_at + 1 and so on, in the
        # order in which the individuals are stored. The purpose is to
        # guarantee unique identifiers, not to sort the individuals.
        # The object is marked as not synchronized afterwards.

        $self -> synchronize;
        for my $subject ( @{ $self -> {'individuals'} } ) {
          $subject -> idnumber( $start_at );
          $start_at++;
        }
        $self -> {'synced'} = 0;
      }
end renumber_ascending
1664
1665 # }}} renumber_ascending
1666
1667 # {{{ renumber_descending
1668
start renumber_descending
      {
        # Gives every individual a new subject identifier, counting
        # downwards from start_at in the order in which the individuals
        # are stored. The object is marked as not synchronized
        # afterwards. See L</renumber_ascending>.
        $self -> synchronize;
        for my $subject ( @{ $self -> {'individuals'} } ) {
          $subject -> idnumber( $start_at );
          $start_at--;
        }
        $self -> {'synced'} = 0;
      }
end renumber_descending
1679
1680 # }}} renumber_descending
1681
1682 # {{{ single_valued_data
1683
start single_valued_data
      {
        # Usage:
        #
        # ($single_value_data_set, $remainder, $column_indexes) =
        #      $data_object -> single_valued_data( subset_name         => 'subset.dta',
        #                                          remainder_name      => 'remainder.dta',
        #                                          target              => 'disk',
        #                                          do_not_test_columns => [1..18,24,26];
        #
        # my $single_value_column_indexes = $column_indexes -> [0];
        # my $all_other_column_indexes    = $column_indexes -> [1];
        #
        # Analyses the content of each column, individual by
        # individual, and returns two new data objects: one that
        # contains all columns that have only one value per individual
        # and one that contains the remaining data. This is useful for
        # creating compact 'extra' data sets that can be read in via
        # user-defined sub-routines when the number of columns needed
        # exceeds the maximum that NONMEM allows (e.g. 20 in NONMEM
        # version V).
        #
        # The I<do_not_test_columns> argument lists (1-based) columns on
        # which the single value test is skipped; these columns always
        # end up in the remainder.

        my @multi_value_flags;
        my @individuals = @{$self -> {'individuals'}};

        # Initiate the flags, one per column of the very first data row.
        # A column listed in do_not_test_columns starts out flagged.
        if ( defined $individuals[0] ) {
          my @data = @{$individuals[0] -> {'subject_data'}};
          my @data_row = split( /,/ , $data[0] );
          for my $i ( 0 .. $#data_row ) {
            $multi_value_flags[$i] =
              ( grep { $i == $_ - 1 } @do_not_test_columns ) ? 1 : 0;
          }
        } else {
          die "data -> single_valued_data: No data in ID number 1\n";
        }

        # Collect the stats: flag every column in which some individual
        # has more than one distinct value.
        for my $individual ( @individuals ) {
          # Split every row only once instead of once per column.
          my @rows = map { [ split( /,/, $_ ) ] }
            @{$individual -> {'subject_data'}};
          next unless ( @rows );

          # The columns examined are those of the individual's first row.
          for my $j ( 0 .. $#{$rows[0]} ) {
            my %col_unique;
            $col_unique{$_ -> [$j]}++ foreach ( @rows );
            $multi_value_flags[$j]++ if ( scalar keys %col_unique > 1 );
          }
        }

        # Translate the flags into two lists of 1-based column indexes:
        # [0] single valued columns, [1] all other columns.
        for my $i ( 0 .. $#multi_value_flags ) {
          my $list = $multi_value_flags[$i] ? 1 : 0;
          push ( @{$column_indexes[$list]}, $i + 1 );
        }

        ( $single_value_data_set, $remainder ) =
          $self -> subset_vertically( column_indexes      => $column_indexes[0],
                                      subset_name         => $subset_name,
                                      return_remainder    => 1,
                                      remainder_name      => $remainder_name,
                                      target              => $target,
                                      keep_first_row_only => 1);
      }
end single_valued_data
1755
1756 # }}}
1757
1758 # {{{ subset_vertically
1759
start subset_vertically
      {
        # Usage:
        #
        # $subset = $data_object -> subset_vertically ( column_indexes => [1,2,6],
        #                                               subset_name    => 'subset.dta' );
        #
        # This basic usage returns a new data object containing
        # columns 1,2 and 6 from the original data plus the
        # idcolumn. The new data object will be associated with the
        # file 'subset.dta'.
        #
        # You get the remaining data, i.e. the original data minus
        # the created subset by specifying
        #
        # ( $subset, $remainder ) =
        #      $data_object -> subset_vertically ( column_indexes   => [1,2,6],
        #                                          subset_name      => 'subset.dta',
        #                                          return_remainder => 1,
        #                                          remainder_name   => 'remainder.dta' );
        #
        # If you would like to flush the created data sets to disk and
        # save memory, set the I<target> argument to 'disk'. The
        # default value 'mem' will keep the whole data object in RAM.
        #
        # The I<keep_first_row_only> argument can be used to reduce
        # the size of the subset data object by excluding all but the
        # first row of data from each individual. The remainder always
        # keeps all rows.
        #
        my @individuals = @{$self -> {'individuals'}};
        die "data -> subset_vertically: No data in ID number 1\n"
          unless ( defined $individuals[0] );

        # Make sure that the id column is part of the subset, and of the
        # remainder as well when the subset was asked to contain it.
        my @first_data = @{$individuals[0] -> {'subject_data'}};
        my $first_idcolumn = $individuals[0] -> {'idcolumn'};
        my $id_flag = ( grep { $_ == $first_idcolumn } @column_indexes ) ? 1 : 0;

        # Create remainder index array if necessary: every column of the
        # first data row that is not used in the subset.
        my @remainder_indexes;
        if ( $return_remainder ) {
          my @first_row = split( /,/, $first_data[0] );
          for my $col ( 1 .. scalar @first_row ) {
            push( @remainder_indexes, $col )
              unless ( grep { $_ == $col } @column_indexes );
          }
          unshift( @remainder_indexes, $first_idcolumn ) if ( $id_flag );
        }
        unshift( @column_indexes, $first_idcolumn ) unless ( $id_flag );

        # Build the new individuals, column by column.
        my @new_ids;
        my @new_ids_2;
        for my $individual ( @individuals ) {
          my $idnumber = $individual -> idnumber;
          my $idcolumn = $individual -> idcolumn;
          my @data = @{$individual -> {'subject_data'}};

          my @new_data;
          my $use_rows = $keep_first_row_only ? 0 : $#data;
          for my $i ( 0 .. $use_rows ) {
            my @data_row = split( /,/, $data[$i] );
            push( @new_data,
                  join( ',', @data_row[ map { $_ - 1 } @column_indexes ] ) );
          }
          push( @new_ids, data::individual -> new( idnumber     => $idnumber,
                                                   idcolumn     => $idcolumn,
                                                   subject_data => \@new_data ) );

          next unless ( $return_remainder );

          my @new_data_2;
          for my $i ( 0 .. $#data ) {
            my @data_row = split( /,/, $data[$i] );
            push( @new_data_2,
                  join( ',', @data_row[ map { $_ - 1 } @remainder_indexes ] ) );
          }
          push( @new_ids_2, data::individual -> new( idnumber     => $idnumber,
                                                     idcolumn     => $idcolumn,
                                                     subject_data => \@new_data_2 ) );
        }

        # The new objects get the matching parts of the header and a
        # copy of the comment (if any).
        my @header = @{$self -> {'header'}};
        my @new_header = @header[ map { $_ - 1 } @column_indexes ];
        my $comment;
        if( defined $self -> {'comment'} ){
          my @comment = @{$self -> {'comment'}};
          $comment = \@comment;
        }
        $subset = data -> new ( filename             => $subset_name,
                                directory            => $self -> {'directory'},
                                ignoresign           => $self -> {'ignoresign'},
                                header               => \@new_header,
                                comment              => $comment,
                                individuals          => \@new_ids,
                                target               => $target,
                                ignore_missing_files => 1 );
        if ( $return_remainder ) {
          my @new_header_2 = @header[ map { $_ - 1 } @remainder_indexes ];
          $remainder = data -> new ( filename             => $remainder_name,
                                     directory            => $self -> {'directory'},
                                     ignoresign           => $self -> {'ignoresign'},
                                     header               => \@new_header_2,
                                     comment              => $comment,
                                     individuals          => \@new_ids_2,
                                     target               => $target,
                                     ignore_missing_files => 1 );
        }
      }
end subset_vertically
1896
1897 # }}}
1898
1899 # {{{ subsets
1900
1901 start subsets
1902       {
             # Splits the individuals randomly into $bins groups and
             # creates one new data object per group (file names
             # subset_0.dta, subset_1.dta, ...). The data objects are
             # pushed onto @subsets and the id numbers of the individuals
             # in each group onto @incl_ids; both arrays are not declared
             # in this section (presumably they are the return values
             # set up by the generated code - TODO confirm).
             #
             # If $stratify_on is defined the individuals are first
             # grouped by the factors of that column and each group is
             # dealt out over the bins separately.
1903 #       if ( defined $expression and defined $bins ) {
1904 #         die "data -> subset: expression and bins may not both be specified\n";
1905 #       }
1906 #       if ( not ( defined $expression or defined $bins ) ) {
1907 #         die "data -> subset: expression or bins must be specified\n";
1908 #       }
1909         $self -> synchronize;
1910         my @header  = @{$self -> {'header'}};
1911         my @comment = defined $self -> {'comment'} ? @{$self -> {'comment'}} : ();
1912         my @subset_ids= ();
1913         my %rnd_ids;
1914         my $key = 0;
1915         my @ids = @{$self -> {'individuals'}};
1916         if ( defined $stratify_on ) {
               # The factors are taken from an in-memory copy of this
               # data object. The values of %strata are used below as
               # lists of indexes into @ids.
1917           my $work_data = $self -> copy( filename => 'work_data.dta',
1918                                          target   => 'mem' );
1919           my %strata = %{$work_data -> factors( column => $stratify_on )};
1920 #         $Data::Dumper::Maxdepth = 1;
1921 #         print Dumper \%strata;
1922
               # Give every individual a random number that is unique
               # within its stratum; sorting on these numbers below
               # yields a random order.
1923           while ( my ( $factor, $keys ) = each %strata ) {
1924               foreach my $key ( @{$keys} ) {
1925                   my $rnd_num = rand;
1926                   while ( defined $rnd_ids{$factor}{$rnd_num} ) {
1927                       $rnd_num = rand;
1928                   }
1929                   $rnd_ids{$factor}{$rnd_num} = $ids[$key];
1930               }
1931           }
               # Deal the individuals of each stratum out over the bins.
               # $i is advanced inside the inner loop, hence the empty
               # increment part of the outer loop. During the first pass
               # ($first) the bins are created, later passes append to
               # them. Every stratum starts again at bin 0.
1932           my $first = 1;
1933           while ( my ( $factor, $rnd_nums ) = each %rnd_ids ) {
1934               my @sort_rnd_nums = sort { $a <=> $b } keys %{$rnd_nums};
1935               for ( my $i = 0; $i <= $#sort_rnd_nums; $i ) {
1936                   for ( my $j = 0; $j < $bins; $j++ ) {
1937                       if ( $first ) {
1938                           push( @subset_ids, [$rnd_ids{$factor}{$sort_rnd_nums[$i]} -> copy] );
1939                           push( @incl_ids, [$rnd_ids{$factor}{$sort_rnd_nums[$i]} -> idnumber] );
1940                       } else {
1941                           push( @{$subset_ids[$j]}, $rnd_ids{$factor}{$sort_rnd_nums[$i]} -> copy );
1942                           push( @{$incl_ids[$j]}, $rnd_ids{$factor}{$sort_rnd_nums[$i]} -> idnumber );
1943                       }
1944                       $i++;
1945                       last if $i > $#sort_rnd_nums;
1946                   }
1947                   $first = 0;
1948               }
1949           }
               # One data object per bin.
               # NOTE(review): the target is hard-coded to 'disk' here
               # while the unstratified branch below uses $target -
               # confirm whether this difference is intended.
1950           for ( my $j = 0; $j < $bins; $j++ ) {
1951             my $sdata = data -> new ( header               => \@header,
1952                                       comment              => \@comment,
1953                                       ignoresign           => $self -> {'ignoresign'},
1954                                       individuals          => $subset_ids[$j],
1955                                       ignore_missing_files => 1,
1956                                       target               => 'disk',
1957                                       idcolumn             => $self -> {'idcolumn'},
1958                                       filename             => "subset_$j.dta" );
1959             #$sdata -> _write;
1960             push( @subsets, $sdata );
1961           }
1962         } else {
               # No stratification: give every individual a unique
               # random number ...
1963           for ( my $i = 0; $i <= $#ids; $i++ ) {
1964             my $rnd_num = rand;
1965             while ( defined $rnd_ids{$rnd_num} ) {
1966               $rnd_num = rand;
1967             }
1968             $rnd_ids{$rnd_num} = $ids[$i];
1969           }
               # ... and deal the individuals out over the bins in the
               # order of these numbers. As above, $i is advanced inside
               # the inner loop and the first pass creates the bins.
1970           my @keys = sort { $a <=> $b } keys %rnd_ids;
1971           my $first = 1;
1972           for ( my $i = 0; $i <= $#keys; $i ) {
1973             for ( my $j = 0; $j < $bins; $j++ ) {
1974               if ( $first ) {
1975                 push( @subset_ids, [$rnd_ids{$keys[$i]} -> copy] );
1976                 push( @incl_ids, [$rnd_ids{$keys[$i]} -> idnumber] );
1977               } else {
1978                 push( @{$subset_ids[$j]}, $rnd_ids{$keys[$i]} -> copy );
1979                 push( @{$incl_ids[$j]}, $rnd_ids{$keys[$i]} -> idnumber );
1980               }
1981               $i++;
1982               last if $i > $#keys;
1983             }
1984             $first = 0;
1985           }
               # One data object per bin, created with the requested
               # target.
1986           for ( my $j = 0; $j < $bins; $j++ ) {
1987             my $sdata = data -> new ( header               => \@header,
1988                                       comment              => \@comment,
1989                                       ignoresign           => $self -> {'ignoresign'},
1990                                       individuals          => $subset_ids[$j],
1991                                       ignore_missing_files => 1,
1992                                       target               => $target,
1993                                       idcolumn             => $self -> {'idcolumn'},
1994                                       filename             => "subset_$j.dta" );
1995             #$sdata -> _write;
1996             push( @subsets, $sdata );
1997           }
1998         }
1999       }
2000 end subsets
2001
2002 # }}} subsets
2003
2004 # {{{ subset
2005
start subset
      {
        # Creates a new data object (file name "subset.dta") from
        # copies of those individuals for which the expression,
        # evaluated on the column given by based_on, is true. The id
        # numbers of the selected individuals are pushed onto
        # @incl_individuals and their positions in the list of
        # individuals (starting at 0) onto @included_keys.
        $self -> synchronize;
        my @header  = @{$self -> {'header'}};
        my @comment = ();
        @comment = @{$self -> {'comment'}} if ( defined $self -> {'comment'} );

        my @subset_inds = ();
        my $position = -1;
        for my $candidate ( @{$self -> {'individuals'}} ) {
          $position++;
          my $selected = $candidate -> evaluate_expression( column     => $based_on,
                                                            expression => $expression );
          next unless ( $selected );

          push( @subset_inds, $candidate -> copy );
          push( @incl_individuals, $candidate -> idnumber );
          push( @included_keys, $position );
        }

        $subset = data -> new ( header      => \@header,
                                comment     => \@comment,
                                ignoresign  => $self -> {'ignoresign'},
                                individuals => \@subset_inds,
                                idcolumn    => $self -> {'idcolumn'},
                                filename    => "subset.dta" );
      }
end subset
2030
2031 # }}} subset
2032
2033 # {{{ target
2034
start target
      {
        # Reacts on a change of the target: going from 'mem' to 'disk'
        # flushes the object, going from 'disk' to 'mem' synchronizes
        # it. The new target is stored before flush/synchronize is
        # called. Any other combination is left alone here.
        if ( $parm eq 'disk' ) {
          if ( $self -> {'target'} eq 'mem' ) {
            $self -> {'target'} = 'disk';
            $self -> flush;
          }
        } elsif ( $parm eq 'mem' ) {
          if ( $self -> {'target'} eq 'disk' ) {
            $self -> {'target'} = 'mem';
            $self -> synchronize;
          }
        }
      }
end target
2046
2047 # }}}
2048
2049 # {{{ _write
2050
start _write
    {
      # Writes the formatted data (the lines returned by format_data)
      # to the file $filename, replacing any existing content.
      # Dies if no filename is given or if the file cannot be created,
      # written or closed.
      die "ERROR: data -> _write: No filename set in data object.\n"
          if( not defined $filename or $filename eq '' );

#      $Data::Dumper::Maxdepth = 2;
#      die Dumper $self -> {'individuals'};

      if( not defined $self -> {'individuals'} ){

        # If we don't have any individuals and write to a new
        # filename, we must first read individuals from the old
        # file. A call to synchronize will do that. There is no risk
        # of an infinite loop here since synchronize always writes to
        # "full_name".

        unless( $filename eq $self -> full_name ){
          $self -> synchronize;
        }
      }

      # Three argument open with a lexical handle: the file name is
      # taken literally, whatever characters it contains.
      open( my $out_fh, '>', $filename )
          or die "Could not create $filename: $!\n";
      my $data_ref = $self -> format_data;
      foreach my $line ( @{$data_ref} ) {
        print {$out_fh} $line
            or die "Could not write to $filename: $!\n";
      }
      # Buffered write errors show up at close time.
      close( $out_fh )
          or die "Could not close $filename: $!\n";

#       if ( $PsN::config -> {'_'} -> {'use_database'} and
#          $self -> {'use_data_table'} ) {
#       # Backslashes messes up the sql syntax
#       my $file_str = $self->{'filename'};
#       my $dir_str = $self->{'directory'};
#       $file_str =~ s/\\/\//g;
#       $dir_str =~ s/\\/\//g;

#       # md5sum
#       my $md5sum = md5_hex(OSspecific::slurp_file($self-> full_name ));
#       my ( $date_str, $time_str );
#       if ( $Config{osname} eq 'MSWin32' ) {
#         $date_str = `date /T`;
#         $time_str = ' '.`time /T`;
#       } else {
#         # Assuming UNIX
#         $date_str = `date`;
#       }
#       chomp($date_str);
#       chomp($time_str);
#       my $date_time = $date_str.$time_str;
#       my $dbh = DBI -> connect("DBI:mysql:host=".$PsN::config -> {'_'} -> {'database_server'}.
#                              ";databse=".$PsN::config -> {'_'} -> {'project'},
#                                $PsN::config -> {'_'} -> {'user'},
#                               $PsN::config -> {'_'} -> {'password'},
#                                {
#                                 'RaiseError' => 1});
#       my $sth;
#       if ( defined $self -> {'data_id'} ) {
#         $sth = $dbh -> prepare( "UPDATE ".$PsN::config -> {'_'} -> {'project'}.
#                              ".data ".
#                                 "SET filename='$file_str',date='$date_time',".
#                                 "directory='$dir_str',md5sum='$md5sum' ".
#                                 "WHERE data_id='".$self -> {'data_id'}."'" );
#         $sth -> execute or debug -> die( message => $sth->errstr ) ;
#       } else {
#         $sth = $dbh -> prepare("INSERT INTO ".$PsN::config -> {'_'} -> {'project'}.
#                              ".data (filename,date,directory,md5sum) ".
#                                "VALUES ('$file_str', '$date_time', '$dir_str','".
#                                $md5sum."' )");
#         $sth -> execute;
#         $self -> {'data_id'} = $sth->{'mysql_insertid'};
#       }
#       $sth -> finish;
#       $dbh -> disconnect;
#       }
    }
end _write
2129
2130 # }}} _write
2131
2132 # {{{ flush
start flush
    {
      # Writes the data to file when there are individuals in memory
      # and the object is either not synchronized or $force is set.
      # After that most of the attributes are emptied to save memory
      # and the object is marked as not synchronized.
      my $have_individuals = defined $self -> {'individuals'};
      my $write_wanted     = ( !$self -> {'synced'} or $force );
      $self -> _write if ( $have_individuals and $write_wanted );

      # The header is deliberately left in place.
#      $self -> {'header'} = undef;
      foreach my $attribute ( 'comment', 'individuals',
                              'column_head_indices', 'have_missing_data' ) {
        $self -> {$attribute} = undef;
      }
      $self -> {'synced'} = 0;
    }
end flush
2149 # }}} flush
2150
2151 # {{{ synchronize
2152
start synchronize
    {
      # Brings the object and the file on disk in line with each other
      # and rebuilds the map from column header to (1-based) column
      # index.
      #
      # Nothing is read or written if the object is already marked as
      # synced. Otherwise:
      #  - if there are individuals in memory they are written to file.
      #    Data in memory wins; nothing is read from the file in this
      #    case. (Open question from the original author: should there
      #    be an attribute, e.g. 'from_file', that forces a re-read
      #    from the file and overwrites what the object contains?)
      #  - else, if the file exists, the header (unless one is already
      #    present) and the individuals are read from it.
      #  - else it is a fatal error.
      if( not $self -> {'synced'} ){
        my $individuals = $self -> {'individuals'};
        if( defined $individuals and scalar @{$individuals} > 0 ){
          $self -> _write;
        } elsif( -e $self -> full_name ){
          my $header = $self -> {'header'};
          $self -> _read_header
              unless( defined $header and scalar @{$header} > 0 );
          $self -> _read_individuals;
        } else {
          debug -> die( message => "Fatal error: datafile: " . $self -> full_name . " does not exist." );
          return;
        }
      }

      # Column numbering starts at 1.
      my $column_number = 0;
      foreach my $head ( @{$self -> {'header'}} ){
        $column_number++;
        $self -> {'column_head_indices'} -> {$head} = $column_number;
      }
      $self -> {'synced'} = 1;
    }
end synchronize
2191
2192 # }}} synchronize
2193
2194 # {{{ _fisher_yates_shuffle
2195
start _fisher_yates_shuffle
      {
        # Shuffles the array referenced by the 'array' argument in
        # place (Fisher-Yates): walking from the last element down to
        # the second, each element is swapped with a randomly chosen
        # element at the same or a lower index.
        #
        # An undefined or empty array only gives a warning; nothing is
        # shuffled in that case.
        my $arr_ref = $parm{'array'};
        if ( not defined $arr_ref or scalar @{$arr_ref} < 1 ) {
          debug -> warn( level   => 1,
                         message => "Array of zero length received" );
        } else {
          # The explicit "$i > 0" test makes the loop a no-op for
          # arrays with fewer than two elements.
          for ( my $i = $#{$arr_ref}; $i > 0; $i-- ) {
            # One random integer between 0 and $i, both included.
            my $j = random_uniform_integer(1,0,$i);
            @$arr_ref[$i,$j] = @$arr_ref[$j,$i];
          }
        }
      }
end _fisher_yates_shuffle
2211
2212 # }}} _fisher_yates_shuffle
2213
2214 # {{{ _read_header
2215
start _read_header
      {
        # Read the header part of the data file named by $self -> full_name.
        #
        # Every line that precedes the first data row is collected; a data
        # row is a line whose first non-blank character is a digit or a '.'.
        # The column names found in those lines are stored in
        # $self -> {'header'} and the lines that are left over are stored in
        # $self -> {'comment'}.
        #
        # Side effects: for table files $self -> {'cont_column'} is set to
        # the 1-based position of the 'CONT' header, and
        # $self -> {'idcolumn'} to the 1-based position of the 'ID' header
        # once the 'CONT' entry has been removed from the header.
        #
        # Dies if the file cannot be opened.
        my $filename   = $self -> full_name;
        my $ignoresign = $self -> ignoresign;
        my ( @data, @header );

        # Number of fields on the first data row. Only the disabled database
        # code at the end of this block reads it.
        my $columns;

        # Three-argument open on a lexical handle: the file name is never
        # interpreted as a mode or a pipe, and no global handle is reused.
        open( my $datafile, '<', $filename ) ||
          die "Could not open $filename for reading";
        while ( my $line = <$datafile> ) {
          # Reduce "a , b" style separators to a bare comma.
          $line =~ s/\s*\,\s*/\,/g;
          if ( $line =~ /^\s*\d+|^\s*\./ ) {
            # We have reached the first data-row, stop reading.
            my @fields = split( /\,\s*|\s+/, $line );
            $columns = scalar @fields;
            last;
          }
          push( @data, $line );
        }
        close( $datafile );

        if ( defined $self -> {'cont_column'} and not $self -> {'table_file'} ) {
          my $cont_idx = $self -> {'cont_column'} - 1;
          my $data_len = $#data;
          # Walk the collected lines from the last one backwards. Each line
          # that carries 'CONT' in the CONT column contributes its fields to
          # the header (the CONT field itself is left out, and so is the
          # first field of every line but the last one) and the last
          # collected line is then removed from the comment lines.
          for ( my $i = $data_len; $i >= 0; $i-- ) {
            my @arr = split(/\,\s*|\s+/,$data[$i]);
            if ( $arr[$cont_idx] eq 'CONT' ) {
              my $start = $i == $data_len ? 0 : 1;
              for ( my $j = $start; $j <= $#arr; $j++ ) {
                push( @header, $arr[$j] ) if ( $j != $cont_idx );
              }
              pop( @data );
            }
          }
          if ( scalar @header > 0 ) {
            # the \Q and \E here are to escape weird ignoresigns; an empty
            # ignoresign is skipped because an empty pattern would re-use
            # the last successfully matched regular expression.
            $header[0] =~ s/\Q$ignoresign\E//
                if ( defined $ignoresign and length $ignoresign );
            shift( @header ) if ( $header[0] eq "" );
          }
        } else {
          # Without CONT handling the last line before the data rows is the
          # header line; there is none if the file starts with a data row.
          my $hdrstring = pop( @data );
          if ( defined $hdrstring ) {
            chomp( $hdrstring );
            @header = split(/\,\s*|\s+/,$hdrstring);
          }
          if ( scalar @header > 0 ) {
            # the \Q and \E here are to escape weird ignoresigns; an empty
            # ignoresign is skipped because an empty pattern would re-use
            # the last successfully matched regular expression.
            $header[0] =~ s/\Q$ignoresign\E//
                if ( defined $ignoresign and length $ignoresign );
            shift( @header ) if ( $header[0] eq "" );
          }
          if( $self -> {'table_file'} ) {
            # Drop the CONT entry from the header and remember where it was.
            my @new_header;
            for( my $i = 1; $i <= scalar @header; $i++ ) {
              if( $header[$i-1] eq 'CONT' ) {
                if ( defined $self -> {'cont_column'} and $i != $self -> {'cont_column'} ) {
                  debug -> warn( level   => 1,
                                 message => "The supplied columns for the CONT data item (".
                                 $self -> {'cont_column'}.") does not match the column where the CONT ".
                                 "header was found ($i), using $i" );
                }
                $self -> {'cont_column'} = $i;
              } else {
                push( @new_header, $header[$i-1] );
              }
            }
            @header = @new_header;
            # Locate the ID column in the header without the CONT entry.
            for( my $i = 1; $i <= scalar @header; $i++ ) {
              if( $header[$i-1] eq 'ID' ) {
                if ( defined $self -> {'idcolumn'} and $i != $self -> {'idcolumn'} ) {
                  debug -> warn( level   => 1,
                                 message => "The supplied columns for the ID data item (".
                                 $self -> {'idcolumn'}.") does not match the column where the ID ".
                                 "header was found ($i), using $i" );
                }
                $self -> {'idcolumn'} = $i;
              }
            }
          }
        }

# I'm not certain on how to deal with this conflict. I'm leaving it commented because I believe this code should not be here.
#
#<<<<<<< data_subs.pm
#       $header[0] =~ s/$ignoresign//
#         if ( defined $self->ignoresign );
#       shift( @header ) if ( $header[0] eq "" );
#=======
#>>>>>>> 1.28

# It is ok with data sets without a header.
#       unless( scalar @header > 0 ){ debug -> die( message => 'Datafile ' . $self -> full_name . ' is empty.' ); }

        $self -> {'header'} = \@header;
        $self -> {'comment'} = \@data;
#       if ( $PsN::config -> {'_'} -> {'use_database'} and
#            $self -> {'use_data_table'} ) {
#         my $dbh = DBI -> connect("DBI:mysql:host=".$PsN::config -> {'_'} -> {'database_server'}.
#                              ";databse=".$PsN::config -> {'_'} -> {'project'},
#                                  $PsN::config -> {'_'} -> {'user'},
#                               $PsN::config -> {'_'} -> {'password'},
#                                  {'RaiseError' => 1});
#         if ( scalar @header < 1 ) {
#           for ( my $i = 1; $i <= $columns; $i++ ) {
#             push( @header, $i );
#           }
#         }
#         for ( my $i = 0; $i <= $#header; $i++ ) {
#           my $sth = $dbh -> prepare("INSERT INTO ".$PsN::config -> {'_'} -> {'project'}.
#                              ".data_column ".
#                                     "(name,number,data_id) ".
#                                     "VALUES ('".$header[$i]."', '".($i+1).
#                                     "', '".$self -> {'data_id'}."' )");
#           $sth -> execute;
#           push( @{$self -> {'data_column_ids'}}, $sth->{'mysql_insertid'} );
#           $sth -> finish;
#         }
#         $dbh -> disconnect;
#       }
      }
end _read_header
2335
2336 # }}} _read_header
2337
2338 # {{{ _read_individuals
2339
start _read_individuals
      {
        # Read the data rows of the file named by $self -> full_name and
        # group consecutive rows that share the same value in the ID column
        # into data::individual objects, which are appended to
        # $self -> {'individuals'}.
        #
        # Lines that do not start with a digit or a '.' are not treated as
        # data rows. When $self -> {'cont_column'} is defined, a row whose
        # CONT value is 1 is not stored on its own: its fields are combined
        # with the next row whose CONT value is not 1 (for table files such
        # rows are skipped altogether).
        #
        # Side effect: $self -> {'have_missing_data'} -> {N} is set to 1 for
        # every column N (1-based) in which a value numerically equal to
        # $self -> {'missing_data_token'} is found.
        #
        # Progress is reported through ui -> print and a status_bar object.
        # Dies if the file cannot be opened.
        my $idcol        = $self -> idcolumn;
        my $filename     = $self -> full_name;
        #debug -> warn( level   => 1,
        #              message => "Building array of individuals from file " . $self -> {'filename'} );

        # Three-argument open on a lexical handle: the file name is never
        # interpreted as a mode or a pipe, and no global handle is reused.
        open( my $datafile, '<', $filename ) ||
          die "Could not open $filename for reading";
        my ( $new_ID, $old_ID, @init_data );

        # Count the newlines first so that the status bar knows how many
        # steps to expect, then rewind for the line-by-line pass below.
        my $buffer;
        my $lines = 0;
        while ( sysread $datafile, $buffer, 4096 ) {
          $lines += ($buffer =~ tr/\n//);
        }
        seek( $datafile, 0,0 );

        # For status bar:
        my $status_bar = status_bar -> new( steps => $lines );

        ui -> print( category => 'scm',
                     message  => "Reading data file: ".$self -> filename );
        ui -> print( category => 'scm',
                     message => $status_bar -> print_step(),
                     newline => 0);

        # The variables below are only referenced by the disabled database
        # code; they are kept so that code can be re-enabled unchanged.
        my ( $sth, $dbh, $first_row_id, $first_value_id );
        my $insert = 1;
#       if ( $PsN::config -> {'_'} -> {'use_database'} and
#            $self -> {'use_data_table'} ) {
#         $dbh = DBI -> connect("DBI:mysql:host=".$PsN::config -> {'_'} -> {'database_server'}.
#                              ";databse=".$PsN::config -> {'_'} -> {'project'},
#                               $PsN::config -> {'_'} -> {'user'},
#                               $PsN::config -> {'_'} -> {'password'},
#                               {'RaiseError' => 1});
#         my $sth = $dbh -> prepare( "SELECT data_row_id FROM ".$PsN::config -> {'_'} -> {'project'}.
#                              ".data_row ".
#                                    "WHERE data_id='".$self -> {'data_id'}."'" );
#         $sth -> execute or debug -> die( message => $sth->errstr ) ;
#         my $select_arr = $sth -> fetchall_arrayref;
#         if ( scalar @{$select_arr} > 0 ) {
#           for ( my $i = 0; $i < scalar @{$select_arr}; $i++ ) {
#             push( @{$self -> {'data_row_ids'}}, $select_arr->[$i][0] );
#           }
#           $sth = $dbh -> prepare( "SELECT data_value_id FROM ".$PsN::config -> {'_'} -> {'project'}.
#                              ".data_value ".
#                                   "WHERE data_id='".$self -> {'data_id'}."'" );
#           $sth -> execute or debug -> die( message => $sth->errstr ) ;
#           my $select_val = $sth -> fetchall_arrayref;
#           for ( my $i = 0; $i < scalar @{$select_val}; $i++ ) {
#             push( @{$self -> {'data_value_ids'}}, $select_val->[$i][0] );
#           }
#           $insert = 0;
#           $dbh -> disconnect;
#         } else {
#           $dbh -> do( "LOCK TABLES ".$PsN::config -> {'_'} -> {'project'}.
#                              ".data_row WRITE, ".$PsN::config -> {'_'} -> {'project'}.
#                              ".data_value WRITE" );
#           $sth = $dbh -> prepare( "SELECT MAX(data_row_id) FROM ".$PsN::config -> {'_'} -> {'project'}.
#                              ".data_row" );
#           $sth -> execute or debug -> die( message => $sth->errstr ) ;
#           my $select_arr = $sth -> fetchall_arrayref;
#           $first_row_id = defined $select_arr -> [0][0] ? $select_arr -> [0][0] : 0;
#           $sth = $dbh -> prepare( "SELECT MAX(data_value_id) FROM ".$PsN::config -> {'_'} -> {'project'}.
#                              ".data_value" );
#           $sth -> execute or debug -> die( message => $sth->errstr ) ;
#           my $select_arr = $sth -> fetchall_arrayref;
#           $first_value_id = defined $select_arr -> [0][0] ? $select_arr -> [0][0] : 0;
#         }
#         $sth -> finish;
#       }

        my $insert_rows;
        my $insert_values;
        my $row_counter = 0;

        # The missing data token does not change while the file is read, so
        # it is looked up once instead of once per row.
        my $mdt = $self -> {'missing_data_token'};

        # Accumulates the fields of the row being assembled; it spans more
        # than one input line when CONT=1 rows are merged.
        my $full_row;
      ROW: while ( my $line = <$datafile> ) {
          $line =~ s/^ *//;
          $line =~ s/\s*\,\s*/\,/g;
          my @new_row   = split(/\,\s*|\s+/, $line);
          # This regexp check is not time consuming.
          if ( $line =~ /^\s*\d+|^\s*\./ ) {
            if ( defined $self -> {'cont_column'} ) {
              my $cont_idx = $self -> {'cont_column'} - 1;
              if ( $new_row[$cont_idx] == 1 ) {
                if ( not $self -> {'table_file'} ) { # Skip the CONT=1 rows if this is a table file
                  # NOTE(review): field 0 of a CONT=1 row is left out here
                  # ($i > 0) - presumably it repeats the ID; confirm.
                  for ( my $i = $#new_row; $i > 0; $i-- ) {
                    if ( $i != $cont_idx ) {
                      unshift( @{$full_row}, $new_row[$i] );
                    }
                  }
                }
                # The row is not complete yet (and no status bar tick is
                # made for this line).
                next ROW;
              } else {
                for ( my $i = $#new_row; $i >= 0; $i-- ) {
#                 if ( $i != ($self -> {'cont_column'} - 1) or $self -> {'table_file'} ) {
                  if ( $i != $cont_idx ) {
                    unshift( @{$full_row}, $new_row[$i] );
                  }
                }
              }
            } else {
              @{$full_row} = @new_row;
            }
            $new_ID = $full_row -> [$idcol-1]; # index starts at 0
            $old_ID = $new_ID if ( not defined $old_ID );

            # Check if column miss data at some row (This adds about 30% of init time)
            for( my $i = 0; $i <= $#{$full_row}; $i++ ){
                $self -> {'have_missing_data'} -> {$i+1} = 1
                  if( $full_row -> [$i] == $mdt ); # == is slower but safer than eq
            }
#           if ( $PsN::config -> {'_'} -> {'use_database'} and
#                $self -> {'use_data_table'} and $insert ) {
#             $row_counter++;
#             $insert_rows = $insert_rows."," if ( defined $insert_rows );
#             $insert_rows = $insert_rows.
#               "('$row_counter', '".$self -> {'data_id'}."' )";
#             for ( my $j = 0; $j <= $#{$full_row}; $j++ ) {
#               $insert_values = $insert_values."," if ( defined $insert_values );
#               $insert_values = $insert_values.
#                 "('".$full_row -> [$j]."', '".
#                   ($first_row_id+$row_counter)."', '".
#                     $self -> {'data_column_ids'}->[$j].
#                       "', '".$self -> {'data_id'}."' )";
#             }
#           }

            if ( $new_ID != $old_ID ) {
              # The ID changed: the rows gathered so far form one individual
              # and the current row starts the next one.
              my @subject_data = @init_data;
              my $id = data::individual -> new ( idcolumn     => $idcol,
                                                 subject_data => \@subject_data,
                                                 data_id      => $self -> {'data_id'} );
              push( @{$self -> {'individuals'}}, $id );
              @init_data =(join( ",", @{$full_row}));
            } else {
              push( @init_data, join( ",", @{$full_row}) );
            }
            $old_ID = $new_ID;
            $full_row = undef;
          }
          if ( $status_bar -> tick() ) {
            ui -> print( category => 'scm',
                         message  => $status_bar -> print_step(),
                         wrap     => 0,
                         newline  => 0 );
          }
        }

#       if ( $PsN::config -> {'_'} -> {'use_database'} and
#            $self -> {'use_data_table'} and $insert ) {
#         $dbh -> do("INSERT INTO ".$PsN::config -> {'_'} -> {'project'}.
#                    ".data_row ".
#                    "(number,data_id) ".
#                    "VALUES ".$insert_rows);
#         push( @{$self -> {'data_row_ids'}}, ($first_row_id..$first_row_id+$row_counter) );
#         $dbh -> do( "INSERT INTO ".$PsN::config -> {'_'} -> {'project'}.
#                     ".data_value ".
#                     "(value,data_row_id,data_column_id,data_id) ".
#                     "VALUES ".$insert_values );
#         push( @{$self -> {'data_value_ids'}},
#               ($first_value_id..$first_value_id+($row_counter*
#                                                  scalar @{$self->{'data_column_ids'}})));
#         $dbh -> do( "UNLOCK TABLES" );
#         $dbh -> disconnect;
#       }

        # The rows of the last individual are still pending once the file
        # is exhausted. It receives the same data_id as every other
        # individual created above.
        if ( $#init_data >= 0 ) {
          push( @{$self -> {'individuals'}},
                data::individual -> new ( idcolumn     => $idcol,
                                          subject_data => \@init_data,
                                          data_id      => $self -> {'data_id'} ) );
        }
        ui -> print( category => 'scm',
                     message  => " ... done" );
        close( $datafile );
#       $self -> _write( filename => 'test.dta' );
      }
end _read_individuals
2517
2518 # }}} _read_individuals
2519