lib/data_subs.pm

   1 This file contains the code for the subroutines in
   2 Perl-speaks-NONMEMs data module. It is not functional by itself.
   3 The code should be transferred to the module autogenerated by dia2code
   4 using the fill_diacode.pl script.
   5
   6 # {{{ include
   7
   8 start include statements
   9 use Digest::MD5 'md5_hex';
  10 use OSspecific;
  11 use File::Copy "cp";
  12 use Carp;
  13 use Carp qw(cluck);
  14 use Config;
  15 use Math::Random;
  16 use Storable;
  17 use debug;
  18 use ui;
  19 use status_bar;
  20 use Data::Dumper;
  21 use Time::HiRes qw(gettimeofday);
  22 my @primary_column_names = ('ID', 'DATE', 'DAT1', 'DAT2', 'DAT3' ,'L1', 'L2', 'DV', 'MDV', 'RAW_', 'MRG_', 'RPT_', 'TIME', 'DROP', 'SKIP', 'EVID', 'AMT', 'RATE', 'SS', 'II', 'ADDL', 'CMT', 'PCMT', 'CALL');
  23 end include
  24
  25 # }}} include statements
  26
  27 # {{{ description
  28
  29 start description
  30     # The structure of the data class is subject-centric, recognising that
  31     # the subjects included in a study often can be regarded as
  32     # independent. A class for the subject level exists within PsN and is
  33     # called the individual class. A data object consists of at least one
  34     # but probably many individual objects plus optional comments.
  35 end description
  36
  37 # }}} description
  38
  39 # {{{ synopsis
  40
  41 start synopsis
  42     #   use data;
  43     #
  44     #   my $data_obj = data -> new ( filename => 'test040314.dta' );
  45     #
  46     #   $data_obj -> renumber_ascending;
  47     #
  48     #   my $subsets_ref = $data_obj -> case_deletion( bins => 10 );
  49     #
  50     #   my @subsets = @{$subsets_ref};
  51 end synopsis
  52
  53 # }}} synopsis
  54
  55 # {{{ see_also
  56
  57 start see_also
  58     # =begin html
  59     #
  60     # <a HREF="model.html">model</a>, <a HREF="output.html">output</a>,
  61     # <a HREF="tool/modelfit.html">tool::modelfit</a>,
  62     # <a HREF="tool.html">tool</a>
  63     #
  64     # =end html
  65     #
  66     # =begin man
  67     #
  68     # model, output, tool::modelfit, tool
  69     #
  70     # =end man
  71 end see_also
  72
  73 # }}} see_also
  74
  75 # {{{ new
  76
  77 start new
  78       {
  79         # If the column holding the subject identifier is not the
  80         # first, it can be specified using the I<idcolumn> attribute
  81         #
  82         # I<ignoresign> determines which rows that are regarded as
  83         # comments. Corresponds to the IGNORE= option in the $DATA
  84         # record in a NONMEM model file.
  85
  86         $this -> {'use_data_table'} = 0;
  87
  88         ( $this -> {'directory'},
  89           $this -> {'filename'} ) = OSspecific::absolute_path( $this -> {'directory'},
  90                                                                $this->{'filename'} );
  91
  92         debug -> warn( level => 2,
  93                        message => "data -> new: Data object initialized from file: ".
  94                        $this -> full_name );
  95
  96 #       sub register_in_database {
  97 #         my $this = shift;
  98 #         # Backslashes messes up the sql syntax
  99 #         my $file_str = $this->{'filename'};
 100 #         my $dir_str = $this->{'directory'};
 101 #         $file_str =~ s/\\/\//g;
 102 #         $dir_str =~ s/\\/\//g;
 103
 104 #         # md5sum
 105 #         my $md5sum = md5_hex(OSspecific::slurp_file($this-> full_name ));
 106 #         my $dbh = DBI ->
 107 #             connect("DBI:mysql:host=".$PsN::config -> {'_'} -> {'database_server'}.
 108 #                     ";databse=".$PsN::config -> {'_'} -> {'project'},
 109 #                     $PsN::config -> {'_'} -> {'user'},
 110 #                     $PsN::config -> {'_'} -> {'password'},
 111 #                     {'RaiseError' => 1});
 112 #         my $sth;
 113 #         my $sth = $dbh -> prepare( "SELECT data_id FROM ".$PsN::config -> {'_'} -> {'project'}.
 114 #                              ".data ".
 115 #                                    "WHERE filename = '$file_str' AND ".
 116 #                                    "directory = '$dir_str' AND ".
 117 #                                    "md5sum = '".$md5sum."'" );
 118 #         $sth -> execute or debug -> die( message => $sth->errstr ) ;
 119 #         my $select_arr = $sth -> fetchall_arrayref;
 120 #         if ( scalar @{$select_arr} > 0 ) {
 121 #           debug -> warn( level   => 1,
 122 #                          message => "Found an old entry in the database matching the ".
 123 #                          "current data file" );
 124 #           if ( scalar @{$select_arr} > 1 ) {
 125 #             debug -> warn( level   => 1,
 126 #                            message => "Found more than one matching entry in database".
 127 #                            ", using the first" );
 128 #           }
 129 #           $this -> {'data_id'} = $select_arr->[0][0];
 130 #         } else {
 131 #           my ( $date_str, $time_str );
 132 #           if ( $Config{osname} eq 'MSWin32' ) {
 133 #             $date_str = `date /T`;
 134 #             $time_str = ' '.`time /T`;
 135 #           } else {
 136 #             # Assuming UNIX
 137 #             $date_str = `date`;
 138 #           }
 139 #           chomp($date_str);
 140 #           chomp($time_str);
 141 #           my $date_time = $date_str.$time_str;
 142 #           $sth = $dbh -> prepare("INSERT INTO ".$PsN::config -> {'_'} -> {'project'}.
 143 #                              ".data (filename,date,directory,md5sum) ".
 144 #                                  "VALUES ('$file_str', '$date_time', '$dir_str','".
 145 #                                  $md5sum."' )");
 146 #           $sth -> execute;
 147 #           $this -> {'data_id'} = $sth->{'mysql_insertid'};
 148 #         }
 149 #         $sth -> finish;
 150 #         $dbh -> disconnect;
 151 #         }
 152
 153
 154         unless ( ( defined $this -> {'header'} and
 155                    scalar @{$this -> {'header'}} > 0 ) or
 156                  ( defined $this -> {'individuals'} and
 157                    scalar @{$this -> {'individuals'}} > 0 ) ) {
 158           if ( -e $this -> full_name ) {
 159             if ( $this -> {'target'} eq 'mem' ) {
 160 #             &register_in_database( $this ) if ( $PsN::config -> {'_'} -> {'use_database'} and
 161 #                                                 $this -> {'use_data_table'} );
 162               $this -> _read_header;
 163               $this -> _read_individuals;
 164               $this -> {'synced'} = 1;
 165             } else {
 166               $this -> {'synced'} = 0;
 167             }
 168           } else {
 169             debug -> die(message => "No header, individuals, and no file " . $this -> full_name . " on disk.")
 170               unless $this -> {'ignore_missing_files'};
 171             $this ->  {'synced'} = 0;
 172           }
 173         } else {
 174           if ( $this -> {'target'} eq 'mem') {
 175             if ( -e $this -> {'filename'} ) {
 176               $this -> _read_header;
 177 #             &register_in_database if ( $PsN::config -> {'_'} -> {'use_database'} and
 178 #                                        $this -> {'use_data_table'} );
 179               $this -> _read_individuals;
 180               $this -> {'synced'} = 1;
 181             } else {
 182               debug -> die(message => "No file:".$this->{'filename'}." on disk" )
 183                   unless $this -> {'ignore_missing_files'};
 184               $this -> {'synced'} = 0;
 185             }
 186           } else {
 187             $this -> flush;
 188           }
 189         }
 190
 191         if ( $this -> {'synced'} ) {
 192           my $i = 1;
 193           foreach my $head ( @{$this -> {'header'}} ) {
 194             $this -> {'column_head_indices'} -> {$head} = $i;
 195             $i++;
 196           }
 197         }
 198 #       $Data::Dumper::Maxdepth = 3;
 199 #       die Dumper $this -> {'individuals'};
 200       }
 201 end new
 202
 203 # }}} new
 204
 205 # {{{ register_in_database
 206 start register_in_database
 207     if ( $PsN::config -> {'_'} -> {'use_database'} ) {
 208       # Backslashes messes up the sql syntax
 209       my $file_str = $self->{'filename'};
 210       my $dir_str = $self->{'directory'};
 211       $file_str =~ s/\\/\//g;
 212       $dir_str =~ s/\\/\//g;
 213
 214       my $project = $PsN::config -> {'_'} -> {'project'};
 215       # md5sum
 216       my $md5sum = md5_hex(OSspecific::slurp_file($self-> full_name ));
 217
 218       my $dbh = DBI -> connect("DBI:mysql:host=".$PsN::config -> {'_'} -> {'database_server'}.
 219                                ";databse=".$project,
 220                                $PsN::config -> {'_'} -> {'user'},
 221                                $PsN::config -> {'_'} -> {'password'},
 222                                {'RaiseError' => 1});
 223
 224       my $sth;
 225
 226       my $select_arr = [];
 227
 228       if ( not $force ) {
 229         my $sth = $dbh -> prepare( "SELECT data_id FROM ".$project.
 230                                    ".data ".
 231                                    "WHERE filename = '$file_str' AND ".
 232                                    "directory = '$dir_str' AND ".
 233                                    "md5sum = '".$md5sum."'" );
 234         $sth -> execute or debug -> die( message => $sth->errstr ) ;
 235         $select_arr = $sth -> fetchall_arrayref;
 236       }
 237
 238       if ( scalar @{$select_arr} > 0 ) {
 239         'debug' -> warn( level   => 1,
 240                          message => "Found an old entry in the database matching the ".
 241                          "current data file" );
 242         if ( scalar @{$select_arr} > 1 ) {
 243           'debug' -> warn( level   => 1,
 244                            message => "Found more than one data matching entry in database".
 245                            ", using the first" );
 246         }
 247         $self -> {'data_id'} = $select_arr->[0][0];
 248         # Find the id's
 249         my $sth = $dbh -> prepare( "SELECT individual_id FROM ".$project.".data_individual ".
 250                                    "WHERE data_id = '".$self -> {'data_id'}."'" );
 251         $sth -> execute or debug -> die( message => $sth->errstr ) ;
 252         my $id_arr = $sth -> fetchall_arrayref;
 253         map( $_ = $_ -> [0], @{$id_arr} );
 254         $self -> {'individual_ids'} = $id_arr;
 255       } elsif ( defined $self -> {'individuals'} ) {
 256         my ( $date_str, $time_str );
 257         if( $Config{osname} eq 'MSWin32' ){
 258           $date_str = `date /T`;
 259           $time_str = ' '.`time /T`;
 260         } else {
 261           # Assuming UNIX
 262           $date_str = `date`;
 263         }
 264         chomp($date_str);
 265         chomp($time_str);
 266         my $date_time = $date_str.$time_str;
 267         my ( $columns, $values );
 268         my $res_str = $resampled ? '1' : '0';
 269         if ( defined $model_id ) {
 270           $columns = '(model_id, filename, date, directory, md5sum, resampled)';
 271           $values = "('$model_id', '$file_str', '$date_time', '$dir_str','".
 272               $md5sum."', '$res_str' )";
 273         } else {
 274           $columns = '(filename, date, directory, md5sum, resampled)';
 275           $values = "('$file_str', '$date_time', '$dir_str','".$md5sum."', '$res_str' )";
 276         }
 277         $sth = $dbh -> prepare("INSERT INTO ".$PsN::config -> {'_'} -> {'project'}.
 278                                ".data $columns VALUES $values");
 279         $sth -> execute;
 280         $self -> {'data_id'} = $sth->{'mysql_insertid'};
 281
 282         if ( defined $self -> {'data_id'} ) {
 283           my $values;
 284           my $columns = "( id_key, id )";
 285           if( $#individual_ids >= 0 ) {
 286             $self -> register_di_relation( individual_ids => \@individual_ids );
 287           } else {
 288             my $inds = scalar @{$self -> {'individuals'}};
 289             $dbh -> do( "LOCK TABLES ".$PsN::config -> {'_'} -> {'project'}.
 290                         ".individual WRITE" );
 291 #           $sth = $dbh -> prepare( "SELECT MAX(individual_id)".
 292 #                                   " FROM ".$PsN::config -> {'_'} -> {'project'}.
 293 #                                   ".individual" );
 294             $dbh -> do( 'USE '.$PsN::config -> {'_'} -> {'project'} );
 295             $sth = $dbh -> prepare( "SHOW TABLE STATUS LIKE 'individual'" );
 296             $sth -> execute or debug -> die( message => $sth->errstr ) ;
 297             my $select_arr = $sth -> fetchall_arrayref;
 298             my $first_id_id = $select_arr -> [0][10] ?
 299                 $select_arr -> [0][10] : 0;
 300 #           my $first_id_id = $select_arr -> [0][0] ? ($select_arr -> [0][0] + 1) : 0;
 301             my $last_id_id = $first_id_id + $inds - 1;
 302             for( my $i = 0; $i < $inds; $i++ ) {
 303               if( defined $self -> {'individuals'}[$i] ) {
 304                 my $id_id = $self -> {'individuals'}[$i] -> idnumber;
 305                 $values = $values."," if ( defined $values );
 306                 $values = $values."( $i, $id_id )";
 307               }
 308             }
 309             $sth = $dbh -> prepare( "INSERT INTO ".$PsN::config -> {'_'} -> {'project'}.
 310                                     ".individual $columns VALUES $values" );
 311             $sth -> execute;
 312             $dbh -> do( "UNLOCK TABLES" );
 313             @individual_ids = ($first_id_id .. $last_id_id);
 314             $self -> register_di_relation( individual_ids => \@individual_ids );
 315           }
 316           $self -> {'individual_ids'} = \@individual_ids;
 317         }
 318         $sth -> finish;
 319         $dbh -> disconnect;
 320       }
 321       $data_id = $self -> {'data_id'}; # return the data_id
 322     }
 323 end register_in_database
 324 # }}} register_in_database
 325
 326 # {{{ register_di_relation
 327 start register_di_relation
 328   if ( $PsN::config -> {'_'} -> {'use_database'} and
 329        defined $self -> {'data_id'} and $#individual_ids >= 0 ) {
 330     my $dbh = DBI -> connect("DBI:mysql:host=".$PsN::config -> {'_'} -> {'database_server'}.
 331                              ";databse=".$PsN::config -> {'_'} -> {'project'},
 332                              $PsN::config -> {'_'} -> {'user'},
 333                              $PsN::config -> {'_'} -> {'password'},
 334                              {'raiseerror' => 1});
 335     my $sth;
 336     my $values;
 337     my $columns = "( data_id, individual_id )";
 338     foreach my $individual_id ( @individual_ids ) {
 339       if ( defined $individual_id ) {
 340         $values = $values."," if ( defined $values );
 341         $values = $values."(".$self -> {'data_id'}.", $individual_id )";
 342       }
 343     }
 344     $sth = $dbh -> prepare( "INSERT INTO ".$PsN::config -> {'_'} -> {'project'}.
 345                                ".data_individual $columns VALUES $values" );
 346     $sth -> execute;
 347     $sth -> finish if ( defined $sth );
 348     $dbh -> disconnect;
 349   }
 350 end register_di_relation
 351 # }}} register_di_relation
 352
 353 # {{{ full_name
 354
 355 start full_name
 356       {
 357         $full_name = $self -> {'directory'} . $self -> {'filename'};
 358       }
 359 end full_name
 360
 361 # }}}
 362
 363 # {{{ bootstrap
 364
 365 start bootstrap
 366       {
 367         # The bootstrap method draws I<samples> number of boostrap
 368         # samples from the data set. The I<subjects> arguments
 369         # determines the size of each sample (default equals to the
 370         # number of individuals in the original data set). The method
 371         # returns references to three arrays: I<boot_samples_ref>,
 372         # which holds the bootstrap data sets, I<incl_individuals_ref>
 373         # which holds arrays containing the subject identifiers (ID's)
 374         # for the included individuals of each bootstrap data set and
 375         # I<included_keys_ref> which holds the key or index of the
 376         # included individuals. The key or index is an integer
 377         # starting at 1 for the first individual in the original data
 378         # set and increasing by one for each following.
 379         $self -> synchronize;
 380         my @header      = @{$self -> {'header'}};
 381         my $individuals = $self -> {'individuals'};
 382         my $key_ref;
 383
 384         my $status_bar = status_bar -> new( steps => $samples );
 385         ui -> print( category => 'bootstrap',
 386                      message => $status_bar -> print_step,
 387                      newline => 0);
 388
 389         for ( my $i = 1; $i <= $samples; $i++ ) {
 390           my $new_name = defined $name_stub ? $name_stub."_$i.dta" : "bs$i.dta";
 391           $new_name = $directory.'/'.$new_name;
 392           my ( $boot, $incl_ind_ref, $incl_key_ref ) =
 393             $self -> resample( subjects    => \%subjects,
 394                                resume      => $resume,
 395                                new_name    => $new_name,
 396                                target      => $target,
 397                                stratify_on => $stratify_on,
 398                                model_id    => $model_ids[$i-1] );
 399           push( @included_keys, $incl_key_ref );
 400           push( @incl_individuals, $incl_ind_ref );
 401 #         $boot -> renumber_ascending;
 402           push( @boot_samples, $boot );
 403 #          $boot -> synchronize;
 404 #          $boot -> flush;
 405           if( $status_bar -> tick() ){
 406             ui -> print( category => 'bootstrap',
 407                          message => $status_bar -> print_step,
 408                          newline => 0,
 409                          wrap => 0);
 410           }
 411 #         print Dumper \@boot_samples;
 412 #         sleep(10);
 413         }
 414         ui -> print( category => 'bootstrap',
 415                      message => ' ... done' );
 416       }
 417 end bootstrap
 418
 419 # }}} bootstrap
 420
 421 # {{{ resample
 422
 423 start resample
 424       {
 425         $self -> synchronize;
 426         my ( @header, $individuals, @bs_inds, $key_ref, @id_ids, @bs_id_ids );
 427         @id_ids = @{$self -> {'individual_ids'}} if( defined $self -> {'individual_ids'} );
 428         my @subj_keys = keys( %subjects );
 429         if ( $#subj_keys < 0 ) {
 430           debug -> die( message => "sample_size must be defined" );
 431         }
 432         if ( defined $stratify_on ) {
 433           my %strata;
 434           if( $stratify_on =~ /\D/ ){
 435             %strata = %{$self -> factors( column_head => $stratify_on )};
 436             if ( $strata{'Non-unique values found'} eq '1' ) {
 437               debug -> die( message => "Individuals were found to have multiple values in the $stratify_on column. ".
 438                             "The column $stratify_on cannot be used for stratification of the resampling." );
 439             }
 440           } else {
 441             %strata = %{$self -> factors( column => $stratify_on )};
 442             if ( $strata{'Non-unique values found'} eq '1' ) {
 443               debug -> die( message => "Individuals were found to have multiple values in column number $stratify_on. ".
 444                             "Column $stratify_on cannot be used for stratification of the resampling." );
 445             }
 446           }
 447           if ( scalar keys( %subjects) != scalar keys( %strata ) and
 448                not ( $#subj_keys == 0 and defined $subjects{'default'} ) ) {
 449             debug -> die( message => "sample_size must be defined using one default value ".
 450                           "or exactly one value per strata:\n".
 451                           "resampling per STUD=1001,1002,1003\n".
 452                           "use -sample_size='1001=>10,1002=>25,1003=>12' or ".
 453                           "-sample_size='default=>10'");
 454           }
 455           unless ( $resume and -e $new_name ) {
 456             @header = @{$self -> {'header'}};
 457             $individuals = $self -> {'individuals'};
 458             while( my ( $factor, $key_list ) = each %strata ) {
 459               my $keys;
 460               if ( defined $subjects{$factor} ) {
 461                 $keys = $subjects{$factor};
 462               } elsif( defined $subjects{'default'} ) {
 463                 $keys = sprintf( "%.0f",($subjects{'default'}*
 464                                (scalar(@{$key_list}))/($self -> count_ind())) );
 465               } else {
 466                 debug -> die( message => "A sample size for strata $factor could not be found ".
 467                               "and no default sample size was set" );
 468               }
 469               for ( my $i = 0; $i < $keys; $i++ ) {
 470                 my $list_ref = random_uniform_integer(1,0,(scalar(@{$key_list}) - 1));
 471                 push( @bs_inds, $individuals ->
 472                       [ $key_list -> [$list_ref] ] -> copy );
 473                 push( @included_keys, $key_list -> [$list_ref] );
 474                 push( @incl_individuals, $individuals ->
 475                       [ $key_list -> [$list_ref] ] -> idnumber );
 476                 push( @bs_id_ids, $id_ids[ $key_list -> [$list_ref] ] );
 477               }
 478             }
 479
 480             $boot = data -> new( header      => \@header,
 481                                  idcolumn    => $self -> {'idcolumn'},
 482                                  ignoresign  => $self -> {'ignoresign'},
 483                                  individuals => \@bs_inds,
 484                                  filename    => $new_name,
 485                                  ignore_missing_files => 1,
 486                                  target      => 'mem' );
 487             $boot -> renumber_ascending;
 488             $boot -> _write;
 489             $boot -> flush;
 490             #$boot -> target( $target );
 491           } else {
 492             # If we are resuming, we still need to generate the
 493             # pseudo-random sequence and initiate a data object
 494             while( my ( $factor, $key_list ) = each %strata ) {
 495               my $keys;
 496               if ( defined $subjects{$factor} ) {
 497                 $keys = $subjects{$factor};
 498               } elsif( defined $subjects{'default'} ) {
 499                 $keys = sprintf( "%.0f",($subjects{'default'}*
 500                                (scalar(@{$key_list}))/($self -> count_ind())) );
 501               } else {
 502                 debug -> die( message => "A sample size for strata $factor could not be found ".
 503                               "and no default sample size was set" );
 504               }
 505               for ( my $i = 0; $i < $keys; $i++ ) {
 506                 my $list_ref = random_uniform_integer(1,0,(scalar(@{$key_list}) - 1));
 507               }
 508             }
 509             $boot = data -> new( idcolumn    => $self -> {'idcolumn'},
 510                                  ignoresign  => $self -> {'ignoresign'},
 511                                  filename    => $new_name,
 512                                  ignore_missing_files => 1,
 513                                  target      => $target );
 514
 515             $boot -> _write;
 516             $boot -> flush;
 517           }
 518         } else {
 519           my $size;
 520           if( defined $subjects{'default'} ) {
 521             $size = $subjects{'default'};
 522           } else {
 523             debug -> die( message => "No default sample size was set" );
 524           }
 525           unless ( $resume and -e $new_name ) {
 526             @header = @{$self -> {'header'}};
 527             $individuals = $self -> {'individuals'};
 528             for ( my $i = 1; $i <= $size; $i++ ) {
 529               $key_ref = random_uniform_integer(1,0,scalar @{$individuals}-1);
 530               push( @bs_inds, $individuals -> [ $key_ref ] -> copy );
 531               push( @included_keys, $key_ref );
 532               push( @incl_individuals, $individuals -> [ $key_ref ] -> idnumber );
 533               push( @bs_id_ids, $id_ids[ $key_ref ] );
 534             }
 535
 536             # MUST FIX: If a file already exists with the same name,
 537             # the created bs data set will be appended to this. IT
 538             # MUST BE OVERWRITTEN!
 539             $boot = data -> new( header      => \@header,
 540                                  idcolumn    => $self -> {'idcolumn'},
 541                                  ignoresign  => $self -> {'ignoresign'},
 542                                  individuals => \@bs_inds,
 543                                  filename    => $new_name,
 544                                  ignore_missing_files => 1,
 545                                  target      => 'mem' );
 546             $boot -> renumber_ascending;
 547             $boot -> _write;
 548             $boot -> target( $target );
 549           } else {
 550             # If we are resuming, we still need to generate the
 551             # pseudo-random sequence and initiate a data object
 552             for ( my $i = 1; $i <= $size; $i++ ) {
 553               random_uniform_integer(1,0,scalar @{$individuals}-1)
 554             }
 555             $boot = data -> new( idcolumn    => $self -> {'idcolumn'},
 556                                  ignoresign  => $self -> {'ignoresign'},
 557                                  filename    => $new_name,
 558                                  ignore_missing_files => 1,
 559                                  target      => $target );
 560
 561             $boot -> _write;
 562             $boot -> flush;
 563           }
 564           if( $target eq 'disk'){
 565             $boot -> flush;
 566           }
 567         }
 568         $boot -> register_in_database( individual_ids => \@bs_id_ids,
 569                                        resampled      => 1,
 570                                        model_id       => $model_id );
 571       }
 572 end resample
 573
 574 # }}} resample
 575
 576 # {{{ case_deletion
 577
 578 start case_deletion
 579       {
 580         # case_deletion creates subsets of the data. The number of
 581         # subsets is specified by the bins argument. The individuals
 582         # of each subset is selected randomly or in ascending
 583         # numerical order depending on the selection argument that can
 584         # be either 'consecutive' or 'random'. case_column must be
 585         # specified to give the method something to base the selection
 586         # on. Valid case_column values are either the column number
 587         # (pure digits) or the name of the column in the (optional)
 588         # header row.
 589         $self -> synchronize;
 590         my @header    = @{$self -> {'header'}};
 591         if ( not defined $case_column ) {
 592           debug -> die( message => "case_column must be specified" );
 593         } else {
 594           if ( not $case_column =~ /^\d/ ) {
 595             for ( my $i = 0; $i <= $#header; $i++ ) {
 596               $case_column = $i+1 if ( $header[$i] eq $case_column );
 597             }
 598           }
 599         }
 600         $bins = defined $bins ? $bins :
 601           scalar keys %{$self -> factors( column => $case_column)};
 602         my %factors   = %{$self -> factors( column => $case_column )};
 603         if ( $factors{'Non-unique values found'} eq '1' ) {
 604           debug -> die( message => "Individuals were found to have multiple values in column number $case_column. ".
 605                         "Column $case_column cannot be used for case deletion." );
 606         }
 607
 608         my $maxbins   = scalar keys %factors;
 609         my @ftrs      = sort { $a <=> $b } keys %factors;
 610         my $individuals = $self -> {'individuals'};
 611         my $maxkey    = scalar @{$individuals} - 1;
 612
 613         my ( @tmp_ftrs, @binsize ) =
 614             ((),());
 615         my ( $k, $j, $i ) = ( 0, 0, 0 );
 616         # Create the binsizes
 617         for ( $j = 0; $j < $maxbins; $j++ ) {
 618           $binsize[ $k++ ]++;
 619           $k = 0 if( $k >= $bins );
 620         }
 621         $self -> _fisher_yates_shuffle( array => \@ftrs ) if( $selection eq 'random' );
 622         for ( $k = 0; $k < $bins; $k++ ) {
 623           for ( $j = 0; $j < $binsize[ $k ]; $j++ ) {
 624 #           print "SK: ",$skipped_keys[ $k ]," F: ",$factors{ $ftrs[ $i ] },"\n";
 625             push( @{$skipped_keys[ $k ]}, @{$factors{ $ftrs[ $i ] }} );
 626             push( @{$skipped_values[ $k ]}, $ftrs[ $i++ ] );
 627           }
 628         }
 629
 630         for ( $k = 0; $k < $bins; $k++ ) {
 631           my @cd_inds = ();
 632           my @del_inds = ();
 633         SELKEYS: foreach my $key ( 0..$maxkey ) {
 634           foreach my $skipped ( @{$skipped_keys[ $k ]} ) {
 635             if ( $key == $skipped ) {
 636               push( @{$skipped_ids[ $k ]}, $individuals ->
 637                     [ $skipped ] -> idnumber );
 638               push( @del_inds, $individuals -> [ $key ] -> copy );
 639               next SELKEYS;
 640             }
 641           }
 642           push( @cd_inds, $individuals -> [ $key ] -> copy );
 643         }
 644           # Set ignore_missing_files = 1 to make it possible to get the result
 645           # in memory only
 646           my $newdata = data ->
 647               new ( header      => \@header,
 648                     ignoresign  => $self -> {'ignoresign'},
 649                     idcolumn    => $self -> {'idcolumn'},
 650                     individuals => \@cd_inds,
 651                     target      => $target,
 652                     filename    => $directory.'/cdd_'.($k+1).'.dta',
 653                     ignore_missing_files => 1 );
 654           my $deldata = data ->
 655               new ( header      => \@header,
 656                     ignoresign  => $self -> {'ignoresign'},
 657                     idcolumn    => $self -> {'idcolumn'},
 658                     individuals => \@del_inds,
 659                     target      => $target,
 660                     filename    => $directory.'/rem_'.($k+1).'.dta',
 661                     ignore_missing_files => 1 );
 662           push( @subsets, $newdata );
 663           push( @remainders, $deldata );
 664           $newdata -> _write;
 665           $newdata -> flush;
 666           $deldata -> _write;
 667           $deldata -> flush;
 668         }
 669       }
 670 end case_deletion
 671
 672 # }}} case_deletion
 673
 674 # {{{ copy
 675 start copy
 676       {
 677         # filename: new data file name.
 678         #
 679         # target: keep the copy in memory ('mem') or write it to disk and flush the memory ('disk').
 680
 681         ($directory, $filename) = OSspecific::absolute_path( $directory, $filename );
 682
 683         # Clone self into new data object. Why don't the individuals get cloned too?
 684         # strange. need to set synced to 0 AND set the {'individuals'} to undef.
 685         cp($self -> full_name, $directory.$filename );
 686         $new_data = Storable::dclone( $self );
 687         $new_data -> {'synced'} = 0;
 688         $new_data -> {'individuals'} = undef;
 689         $new_data -> synchronize;
 690
 691         # Set the new file name for the copy
 692         $new_data -> directory( $directory );
 693         $new_data -> filename( $filename );
 694       }
 695 end copy
 696
 697 # }}} copy
 698
 699 # {{{ column_to_array
 700
 701 start column_to_array
 702 {
 703   $self -> synchronize;
 704
 705   if ( not $column =~ /^\d/ ) {
 706     $column = $self -> {'column_head_indices'} -> {$column} - 1;
 707   }
 708
 709   if( $column < 0 or $column > $#{$self -> {'header'}} ){
 710     return [];
 711   }
 712
 713   #have separate loops for case without filter and with filter,
 714   #to minimize risk of errors.
 715
 716   if (scalar(@filter)==0){
 717       foreach my $individual ( @{$self -> individuals} ){
 718           foreach my $individual_row( @{$individual -> subject_data} ){
 719               my @row = split(/,/ , $individual_row);
 720               push( @array, $row[$column] );
 721           }
 722       }
 723   } else {
 724       my $index=0;
 725       foreach my $individual ( @{$self -> individuals} ){
 726           foreach my $individual_row( @{$individual -> subject_data} ){
 727               if ($filter[$index] > 0){
 728                   my @row = split(/,/ , $individual_row);
 729                   push( @array, $row[$column] );
 730               }
 731               $index++;
 732               if ($index > $#filter){
 733                   $index=0;
 734               }
 735           }
 736       }
 737       unless ($index == 0){
 738           'debug' -> die(message =>"Number of rows in dataset was not a multiple of ".
 739                          "filter length.");
 740       }
 741   }
 742
 743 }
 744 end column_to_array
 745
 746 # }}}
 747
 748 # {{{ count_ind
 749
 750 start count_ind
 751       {
 752         # Returns the number of individuals in the data set.
 753         $self -> synchronize;
 754         if( defined $self -> individuals() ) {
 755           $num = scalar @{$self -> individuals()};
 756         } else {
 757           debug -> die( message => "No individuals found in file ".
 758                         $self -> filename() );
 759         }
 760       }
 761 end count_ind
 762
 763 # }}} count_ind
 764
 765 # {{{ diff
 766
 767 start diff
 768 {
 769   $self -> synchronize;
 770
 771   my $first_id = $self -> {'individuals'}[0];
 772
 773   debug -> die( message => "No individuals defined in data object based on ".
 774                 $self -> full_name ) unless ( defined $first_id );
 775
 776   # Check if $column(-index) is defined and valid, else try to find index
 777   # using column_head
 778
 779   my @data_row = split( /,/, $first_id -> subject_data -> [0] );
 780   if( $#columns >= 0 ) {
 781     foreach my $column ( @columns ) {
 782       unless ( defined $column && defined( $data_row[$column-1] ) ) {
 783         debug -> die( message => "Error in data -> factors: ".
 784                       "invalid column number: \"$column\"\n".
 785                       "Valid column numbers are 1 to ".
 786                       scalar @{$first_id -> subject_data ->[0]}."\n" );
 787       }
 788     }
 789   } elsif ( $#column_heads >= 0 ) {
 790     foreach my $column_head ( @column_heads ) {
 791       unless (defined($column_head) && defined($self -> {'column_head_indices'}{$column_head})) {
 792         debug -> die( message => "Error in data -> factors: unknown column: \"$column_head\" ".
 793                       "Valid column headers are (in no particular order):\n".
 794                       join(', ',keys(%{$self -> {'column_head_indices'}})) );
 795       } else {
 796         my $column = $self -> {'column_head_indices'}{$column_head};
 797         push( @columns, $column );
 798         debug -> warn( level   => 2,
 799                        message => "$column_head is in column number $column" );
 800       }
 801     }
 802   } else {
 803     debug -> die( message => "No column or column_head defined" );
 804   }
 805
 806   if( $global_largest or $global_smallest or
 807       $largest_per_individual or $smallest_per_individual ) {
 808     if( not scalar @{$self -> {'individuals'}} == scalar @{$against_data -> individuals} ) {
 809       debug -> die( message => "Both data object must hold the same number of individuals ".
 810                     "and observations when calling data -> diff" );
 811     }
 812     for( my $i = 0; $i < scalar @{$self -> {'individuals'}}; $i++ ) {
 813       my %id_diffs = %{$self -> {'individuals'}[$i] ->
 814                            diff( against_individual => $against_data -> individuals -> [$i],
 815                                  columns            => \@columns,
 816                                  absolute_diff      => $absolute_diff,
 817                                  diff_as_fraction   => $diff_as_fraction,
 818                                  largest            => ( $global_largest or $largest_per_individual ),
 819                                  smallest           => ( $global_smallest or $smallest_per_individual ) )};
 820       if( $global_largest ) {
 821         for( my $j = 0; $j <= $#columns; $j++ ) {
 822           my $label = defined $column_heads[$j] ? $column_heads[$j] : $columns[$j];
 823           if( not defined $diff_results{$label} or not defined $diff_results{$label}{'diff'} or
 824               $id_diffs{$columns[$j]}{'diff'} > $diff_results{$label}{'diff'} ) {
 825             $diff_results{$label}{'diff'} = $id_diffs{$columns[$j]}{'diff'};
 826             $diff_results{$label}{'self'} = $id_diffs{$columns[$j]}{'self'};
 827             $diff_results{$label}{'test'} = $id_diffs{$columns[$j]}{'test'};
 828           }
 829         }
 830       }
 831     }
 832   } else {
 833     die "data -> diff is only implemented for finding the largest difference at any observation at this point\n";
 834   }
 835 }
 836 end diff
 837
 838 # }}} diff
 839
 840 # {{{ filename
 841 start filename
 842       {
 843         if ( defined $parm and $parm ne $self -> {'filename'} ) {
 844           $self -> {'filename'} = $parm;
 845           $self -> {'data_id'} = undef;
 846 #         $self -> _write;
 847         }
 848       }
 849 end filename
 850 # }}} filename
 851
 852 # {{{ fractions
 853
 854 start fractions
 855       {
 856         my %factors = $self -> factors( 'return_occurences' => 1,
 857                                         'unique_in_individual' => $unique_in_individual,
 858                                         'column_head' => $column_head,
 859                                         'column' => $column);
 860
 861         my $sum = 0;
 862         while (my ($factor, $amount) = each %factors) {
 863           if ( $factor == $self -> {'missing_data'} && $ignore_missing ) {
 864             next;
 865           } else {
 866             $sum += $amount;
 867           }
 868         }
 869         while (my ($factor, $amount) = each %factors) {
 870           if ( $factor == $self -> {'missing_data'} && $ignore_missing ) {
 871             next;
 872           } else {
 873             $fractions{$factor} = $amount/$sum;
 874           }
 875         }
 876       }
 877 end fractions
 878
 879 # }}} fractions
 880
 881 # {{{ factors
 882
 883 start factors
 884       {
 885         # Either column (number, starting at 1) or column_head must be specified.
 886         #
 887         # The default behaviour is to return a hash with the factors as keys
 888         # and as values references to arrays with the order numbers (not the ID numbers)
 889         # of the individuals that contain this factor
 890         #
 891         # If unique_in_individual is true (1), the returned hash will contain
 892         # an element with key 'Non-unique values found' and value 1 if any
 893         # individual contain more than one value in the specified column.
 894         #
 895         # Return occurences will calculate the occurence of each
 896         # factor value. Several occurences in one individual counts as
 897         # one occurence. The elements of the returned hash will have the factors
 898         # as keys and the number of occurences as values.
 899         #
 900
 901         $self -> synchronize;
 902
 903         # Check if $column(-index) is defined and valid, else try to find index
 904         # using column_head
 905         my $first_id = $self -> {'individuals'}[0];
 906
 907         debug -> die( message => "No individuals defined in data object based on ".
 908                       $self -> full_name ) unless ( defined $first_id );
 909
 910         my @data_row = split( /,/, $first_id -> subject_data -> [0] );
 911         unless ( defined $column && defined( $data_row[$column-1] ) ) {
 912           unless (defined($column_head) && defined($self -> {'column_head_indices'}{$column_head})) {
 913             debug -> die( message => "Error in data -> factors: unknown column: \"$column_head\" ".
 914                           "or invalid column number: \"$column\".\n".
 915                           "Valid column numbers are 1 to ".scalar @data_row ."\n".
 916                           "Valid column headers are (in no particular order):\n".
 917                           join(', ',keys(%{$self -> {'column_head_indices'}})) );
 918           } else {
 919             $column = $self -> {'column_head_indices'}{$column_head};
 920             debug -> warn( level   => 2,
 921                            message => "$column_head is in column number $column" );
 922           }
 923         }
 924
 925         my $key = 0;
 926         foreach my $individual ( @{$self -> {'individuals'}} ) {
 927           my @ifactors = keys %{$individual -> factors( column => $column )};
 928           if ( scalar @ifactors > 1 and $unique_in_individual ) {
 929             %factors = ( 'Non-unique values found' => 1 );
 930             last;
 931           }
 932           debug -> die( message => "No value found in column $column in individual ".
 933                         $individual -> idnumber ) if ( scalar @ifactors == 0 );
 934
 935           # Return occurences will calculate the occurence of each
 936           # factor value. Several occurences in one individual counts as
 937           # one occurence.
 938
 939           if ( $return_occurences ) {
 940             foreach my $ifactor ( @ifactors ) {
 941               $factors{$ifactor}++;
 942             }
 943           } else {
 944             foreach my $ifactor ( @ifactors ) {
 945               push( @{$factors{$ifactor}}, $key );
 946             }
 947           }
 948           $key++;
 949         }
 950       }
 951 end factors
 952
 953 # }}} factors
 954
 955 # {{{ find_individual
 956
 957 # start find_individual
 958 #         foreach my $tmp_ind ( @{$self -> individuals} ) {
 959 #         if ( $tmp_ind -> key == $key ) {
 960 #           $individual = $tmp_ind;
 961 #           last;
 962 #         }
 963 #       }
 964 #         if ( defined $individual ) {
 965 #         if ( $copy ) {
 966 #           $individual = $individual -> copy;
 967 #         }
 968 #       } else {
 969 #         print "No individual with key $key found in call to ".
 970 #           "data -> find_individual\n" if ( $self -> debug );
 971 #         }
 972 # end find_individual
 973
 974 # }}}
 975
 976 # {{{ format_data
 977
 978 start format_data
 979       {
 980         my $header = $self -> {'header'};
 981
 982         # format the data for NONMEM (simple comma-separated layout)
 983         if ( defined $self -> {'comment'} ) {
 984           my @comment   = @{$self -> {'comment'}};
 985           for ( @comment ) {
 986             push( @form_data );
 987           }
 988         }
 989
 990         my $wrap = ( defined $self -> {'wrap_column'} and
 991                      defined $self -> {'cont_column'} );
 992
 993         my @primary_columns = defined $self -> {'primary_columns'} ?
 994           @{$self -> {'primary_columns'}} : ();
 995         my @secondary_columns = defined $self -> {'secondary_columns'} ?
 996           @{$self -> {'secondary_columns'}} : ();
 997         if ( defined $header and defined $self -> {'ignoresign'} ) {
 998           my $istr;
 999           if ( $self -> {'ignoresign'} ne '@' ) {
1000             $istr = $self -> {'ignoresign'};
1001           }
1002           if ( $wrap ) {
1003             my @h_data;
1004             for ( my $i = 0; $i <= $#secondary_columns ; $i++ ) {
1005               my $sstr = $istr;
1006               for ( my $j = 0; $j < scalar @{$secondary_columns[$i]} ; $j++ ) {
1007                 my $jstr = $j == 0 ? '' : ',';
1008                 $sstr = $sstr.$jstr.$secondary_columns[$i][$j][0];
1009               }
1010               push( @h_data, $sstr."\n" );
1011             }
1012             push( @form_data, @h_data );
1013             my $pstr = $istr;
1014             for ( my $i = 0; $i <= $#primary_columns ; $i++ ) {
1015               my $jstr = $i == 0 ? '' : ',';
1016               $pstr = $pstr.$jstr.$primary_columns[$i][0];
1017             }
1018             push( @form_data, $pstr."\n" );
1019           } else {
1020             push( @form_data, $istr.join(',',@{$self -> {'header'}})."\n" );
1021           }
1022         }
1023         if ( $wrap ) {
1024           foreach my $individual ( @{$self -> {'individuals'}} ) {
1025             foreach my $row ( @{$individual -> subject_data} ) {
1026               my @r_data;
1027               for ( my $i = 0; $i <= $#secondary_columns ; $i++ ) {
1028                 my $sstr = '';
1029                 for ( my $j = 0; $j < scalar @{$secondary_columns[$i]} ; $j++ ) {
1030                   my $jstr = $j == 0 ? '' : ',';
1031                   if ( $secondary_columns[$i][$j][0] eq 'CONT' ) {
1032                     $sstr = $sstr.$jstr.'1';
1033                   } else {
1034                     my @data_row = split( /,/, $row );
1035                     $sstr = $sstr.$jstr.$data_row[$secondary_columns[$i][$j][1]];
1036                   }
1037                 }
1038                 push( @r_data, $sstr."\n" );
1039               }
1040               push( @form_data, @r_data );
1041               my $pstr = '';
1042               for ( my $i = 0; $i <= $#primary_columns ; $i++ ) {
1043                 my $jstr = $i == 0 ? '' : ',';
1044                 if ( $primary_columns[$i][0] eq 'CONT' ) {
1045                   $pstr = $pstr.$jstr.'0';
1046                 } else {
1047                   my @data_row = split( /,/, $row );
1048                   $pstr = $pstr.$jstr.$data_row[$primary_columns[$i][1]];
1049                 }
1050               }
1051               push( @form_data, $pstr."\n" );
1052             }
1053           }
1054         } else {
1055           foreach my $individual ( @{$self -> {'individuals'}} ) {
1056             foreach my $row ( @{$individual -> subject_data} ) {
1057               push( @form_data, $row ."\n" );
1058             }
1059           }
1060         }
1061       }
1062 end format_data
1063
1064 # }}} format_data
1065
1066 # {{{ drop_dropped
1067
1068 start drop_dropped
1069       {
1070         # This method removes columns that has '=DROP' value in the
1071         # model header as given by $INPUT. The model header must be
1072         # transfered to this method through the model_header
1073         # argument. The model_header argument should be a
1074         # two-dimensional array where each position in the first
1075         # dimension should be a reference to a 1*2 array holding the
1076         # column name and value. Any ignore-sign must be removed.
1077
1078         debug -> die( message => 'model header must be defined' )
1079           if ( $#model_header < 0 );
1080         # Important that the drop_dropped method of the model::problem
1081         # class is in sync with this method.
1082         $self -> synchronize;
1083
1084         $self -> {'header'} = [];
1085         my @drop;
1086         my $counter = 1;
1087         for( my $i = 0; $i <= $#model_header; $i++ ) {
1088           $self -> {'idcolumn'} = $counter if ( $model_header[$i][0] eq 'ID' );
1089           if( ( $model_header[$i][1] eq 'DROP' or
1090                 $model_header[$i][1] eq 'SKIP' ) and
1091               not $model_header[$i][0] =~ /DAT(E|1|2|3)/ ) {
1092             push( @drop, 1 );
1093           } else {
1094             $counter++;
1095             push( @drop, 0 );
1096             push( @{$self -> {'header'}}, $model_header[$i][0] );
1097           }
1098         }
1099
1100         foreach my $individual ( @{$self -> {'individuals'}} ) {
1101           $individual -> drop_columns( drop => \@drop );
1102         }
1103
1104         $self -> {'synced'} = 0;
1105 #       $Data::Dumper::Maxdepth = 2;
1106 #       die Dumper $self;
1107 #       die Dumper $self -> {'individuals'};
1108       }
1109 end drop_dropped
1110
1111 # }}} drop_dropped
1112
1113 # {{{ wrap
1114 start wrap
1115       {
1116         $self -> synchronize;
1117         $self -> cont_column( $cont_column ) if ( defined $cont_column );
1118         $self -> wrap_column( $wrap_column ) if ( defined $wrap_column );
1119         $self -> prepare_wrap( model_header => \@model_header );
1120         @secondary_columns = @{$self -> {'secondary_columns'}}
1121           if ( defined $self -> {'secondary_columns'} );
1122         @primary_columns = @{$self -> {'primary_columns'}}
1123           if ( defined $self -> {'primary_columns'} );
1124         $self -> synced(0);
1125       }
1126 end wrap
1127 # }}} wrap
1128
1129 # {{{ unwrap
1130 start unwrap
1131       {
1132         $self -> {'cont_column'} = undef;
1133         $self -> {'wrap_column'} = undef;
1134         $self -> {'secondary_columns'} = undef;
1135         $self -> {'primary_columns'} = undef;
1136       }
1137 end unwrap
1138 # }}} unwrap
1139
1140 # {{{ prepare_wrap
1141
1142 start prepare_wrap
1143       {
1144         my $cont_column = $self -> {'cont_column'};
1145         my $wrap_column = $self -> {'wrap_column'};
1146         debug -> die( message => 'cont_column ('.$cont_column.') must be less or equal '.
1147                       'to the requested number of columns in each row ('.
1148                       ($wrap_column).')' )
1149           if ( $cont_column > $wrap_column );
1150         my @header;
1151         if ( scalar @model_header > 0 ) {
1152           @header = @model_header;
1153         } else {
1154           @header = @{$self -> {'header'}};
1155         }
1156
1157         my ( @primary, @secondary, @date_columns );
1158
1159         for ( my $i = 0; $i <= $#header; $i++ ) {
1160           my $name  = ref( $header[$i] ) eq 'ARRAY' ? $header[$i][0] : $header[$i];
1161           my $value = ref( $header[$i] ) eq 'ARRAY' ? $header[$i][1] : undef;
1162           next if ( $name eq 'ID' );
1163           my $found = 0;
1164           foreach my $prim ( @primary_column_names ) {
1165             if ( not $found and
1166                  ( $name eq $prim or $value eq $prim ) ) {
1167               push( @primary, [$name, $i, $value] );
1168               $found = 1;
1169               my $col = ($#primary+2)>= $cont_column ? ($#primary+3) : ($#primary+2);
1170               push( @date_columns, $col ) if ( $name =~ /DAT(E|1|2|3)/ );
1171             }
1172           }
1173           push( @secondary, [$name, $i, $value] ) if ( not $found );
1174         }
1175
1176         my $prim_num = scalar @primary;
1177         debug -> die( message => 'The number of primary columns (that need to '.
1178                       'be part of the row with CONT=0) ('.($prim_num+1).
1179                       ') is larger than the required number of columns (wrap_column='.
1180                       $wrap_column.') - 1' )
1181           if ( scalar $prim_num > ($wrap_column-2) );
1182
1183         my ( $i, $dum ) = ( 0, 1 );
1184         my @tmp;
1185         for ( my $j = 1; $j <= $wrap_column; $j++ ) {
1186           if( $j == 1 ) {
1187             push( @tmp, ['ID', $self -> {'idcolumn'}-1] );
1188           } elsif ( $j == $wrap_column ) {
1189             if ( $j == $cont_column ) {
1190               push( @tmp, ['CONT', undef] );
1191             } else {
1192               my $val;
1193               if ( defined $primary[$i] ) {
1194                 $val = $primary[$i];
1195               } elsif ( defined $secondary[0] ) {
1196                 $val = shift(@secondary);
1197               } else {
1198                 $val = ['XX'.$dum++,$self -> {'idcolumn'}-1];
1199               }
1200               push( @tmp, $val );
1201               $i++;
1202             }
1203             push( @{$self -> {'primary_columns'}}, @tmp );
1204           } else {
1205             if ( $j == $cont_column ) {
1206               push( @tmp, ['CONT', undef] );
1207             } else {
1208               if ( $i <= $#primary ) {
1209                 push( @tmp, $primary[$i] );
1210                 $i++;
1211               } else {
1212                 my $val = defined $secondary[0] ? shift(@secondary) :
1213                   ['XX'.$dum++,$self -> {'idcolumn'}-1];
1214                 push( @tmp, $val );
1215               }
1216             }
1217           }
1218         }
1219
1220         my $i = 0;
1221         while ( $i <= $#secondary ) {
1222           my @tmp;
1223           for ( my $j = 1; $j <= $wrap_column; $j++ ) {
1224             if( $j == 1 ) {
1225               push( @tmp, ['ID', $self -> {'idcolumn'}-1] );
1226             } elsif ( $j == $wrap_column ) {
1227               if ( $j == $cont_column ) {
1228                 push( @tmp, ['CONT', undef] );
1229               } else {
1230                 my $val = defined $secondary[$i] ? $secondary[$i] :
1231                   ['XX'.$dum++,$self -> {'idcolumn'}-1];
1232                 push( @tmp, $val );
1233                 $i++;
1234               }
1235               unshift( @{$self -> {'secondary_columns'}}, \@tmp );
1236             } else {
1237               if ( $j == $cont_column ) {
1238                 push( @tmp, ['CONT', undef] );
1239               } else {
1240                 my $isdate = 0;
1241                 if ( $#date_columns >= 0 ) {
1242                   foreach my $col ( @date_columns ) {
1243                     # This is a date column which may have to be dropped
1244                     # and thus will not appear as a secondary
1245                     # column. Nothing should be pushed. The indexes in
1246                     # model::problem::pk::_format_record will be ok.
1247                     $isdate = 1 if ( $col == $j ) ;
1248                   }
1249                 }
1250                 if ( $isdate ) {
1251                   push( @tmp, ['XX'.$dum++,$self -> {'idcolumn'}-1] );
1252                 } else {
1253                   if ( $i <= $#secondary ) {
1254                     push( @tmp, $secondary[$i] );
1255                     $i++;
1256                   } else {
1257                     push( @tmp, ['XX'.$dum++,$self -> {'idcolumn'}-1] );
1258                   }
1259                 }
1260               }
1261             }
1262           }
1263         }
1264       }
1265 end prepare_wrap
1266
1267 # }}} prepare_wrap
1268
1269 # {{{ have_missing_data
1270
1271 start have_missing_data
1272       {
1273         # Either I<column> or I<column_head> must be specified.
1274         #
1275         # This method looks through the data column with index I<column> or
1276         # (optional) header name I<column_head> and returns O if no missing
1277         # data indicator was found or 1 otherwise.
1278
1279         $self -> synchronize;
1280         my $first_id = $self -> {'individuals'}[0];
1281         debug -> die( message => "No individuals defined in data object based on ".
1282                       $self -> full_name ) unless ( defined $first_id );
1283         my @data_row = split( /,/ , $first_id -> subject_data -> [0] );
1284         unless ( defined $column  && defined( $data_row[$column-1] ) ) {
1285           unless(defined($column_head) && defined($self -> {'column_head_indices'}{$column_head})){
1286             die "Error in data -> have_missing_data: unknown column: \"$column_head\" or invalid column number: \"$column\"\n";
1287           } else {
1288             $column = $self -> {'column_head_indices'}{$column_head};
1289           }
1290         }
1291         $self -> flush if ( $self -> {'target'} eq 'disk' );
1292
1293         # In case anyone wonders, the ternary statment ( bool ? true :
1294         # false ) below will possibly make a minuscle memory
1295         # optimization. But hey, why not :)
1296
1297         $return_value = defined $self -> {'have_missing_data'} ? $self -> {'have_missing_data'} -> {$column} : 0;
1298       }
1299 end have_missing_data
1300
1301 # }}} have_missing_data
1302
1303 # {{{ merge
1304 start merge
1305       {
1306         #$self -> synchronize;
1307         push( @{$self -> {'individuals'}}, @{$mergeobj -> individuals} );
1308       }
1309 end merge
1310 # }}} merge
1311
1312 # {{{ max
1313
1314 start max
1315       {
1316         # Either column or column_head must be specified. Column_head must be a string that
1317         # identifies a column in the (optional ) data file header.
1318
1319 # The if-statement below used to be a cache of allready calculated
1320 # means. But since individuals can be accessed in so many ways, we
1321 # don't know when this cache should be updated. Its easier to
1322 # recalculate the max. Maybe we can include this optimization in the
1323 # future, if it turns out to be a bottleneck
1324 #       my $tmp_column = $self -> {'column_head_indices'}{$column_head};
1325 #       if ( defined $self -> {'max'}[$tmp_column] ) {
1326 #         $return_value = $self -> {'max'}[$tmp_column] ;
1327 #       } else {
1328           $self -> synchronize;
1329           my $first_id = $self -> {'individuals'}[0];
1330           debug -> die( message => "data -> max: No individuals defined in data object based on " .
1331                         $self -> full_name ) unless defined $first_id;
1332
1333           my @data_row = split( /,/ , $first_id -> subject_data ->[0] );
1334
1335           unless ( defined $column  && defined( $data_row[$column-1] ) ) {
1336             unless (defined($column_head) && defined($self -> {'column_head_indices'}{$column_head})) {
1337               die "Error in data -> max: unknown column: \"$column_head\" or invalid column number: \"$column\"\n";
1338             } else {
1339               $column = $self -> {'column_head_indices'}{$column_head};
1340             }
1341           }
1342           foreach my $individual ( @{$self -> {'individuals'}} ) {
1343             my $ifactors = $individual -> factors( 'column' => $column );
1344             foreach ( keys %{$ifactors} ) {
1345               next if ( $_ == $self -> {'missing_data_token'} );
1346               if ( defined ($return_value) ) {
1347                 $return_value = $_ > $return_value ? $_ : $return_value;
1348               } else {
1349                 $return_value = $_;
1350               }
1351             }
1352           }
1353
1354 #         $self -> {'max'}[$column] = $return_value;
1355           $self -> flush if ( $self -> {'target'} eq 'disk' );
1356 #       }
1357       }
1358 end max
1359
1360 # }}} max
1361
1362 # {{{ min
1363
1364 start min
1365       {
1366         # See L</max>.
1367         my $tmp_column = $self -> {'column_head_indices'}{$column_head};
1368
1369 # The if-statement below used to be a cache of allready calculated
1370 # means. But since individuals can be accessed in so many ways, we
1371 # don't know when this cache should be updated. Its easier to
1372 # recalculate the min. Maybe we can include this optimization in the
1373 # future, if it turns out to be a bottleneck
1374 #       if ( defined $self -> {'min'}[$tmp_column] ) {
1375 #         $return_value = $self -> {'min'}[$tmp_column] ;
1376 #       } else {
1377           $self -> synchronize;
1378           my $first_id = $self -> {'individuals'}[0];
1379           die "data -> min: No individuals defined in data object based on ",
1380             $self -> full_name,"\n" unless defined $first_id;
1381
1382           my @data_row = split( /,/ , $first_id -> subject_data ->[0] );
1383
1384           unless ( defined $column  && defined( $data_row[$column-1] ) ) {
1385             unless (defined($column_head) && defined($self -> {'column_head_indices'}{$column_head})) {
1386               die "Error in data -> min: unknown column: \"$column_head\" or invalid column number: \"$column\"\n";
1387             } else {
1388               $column = $self -> {'column_head_indices'}{$column_head};
1389             }
1390           }
1391           foreach my $individual ( @{$self -> {'individuals'}} ) {
1392             my $ifactors = $individual -> factors( 'column' => $column );
1393             foreach ( keys %{$ifactors} ) {
1394               next if ( $_ == $self -> {'missing_data_token'} );
1395               if ( defined ($return_value) ) {
1396                 $return_value = $_ < $return_value ? $_ : $return_value;
1397               } else {
1398                 $return_value = $_;
1399               }
1400             }
1401           }
1402 #         $self -> {'min'}[$column] = $return_value;
1403           $self -> flush if ( $self -> {'target'} eq 'disk' );
1404 #       }
1405       }
1406 end min
1407
1408 # }}} min
1409
1410 # {{{ median
1411
1412 start median
1413       {
1414           # See L</max>.
1415         $self -> synchronize;
1416         my $first_id = $self -> {'individuals'}[0];
1417         die "data -> median: No individuals defined in data object based on ",
1418           $self -> full_name,"\n" unless defined $first_id;
1419
1420         my @data_row = split( /,/ , $first_id -> subject_data ->[0] );
1421
1422         unless ( defined $column  && defined( $data_row[$column-1] ) ) {
1423           unless(defined($column_head) && defined($self -> {'column_head_indices'}{$column_head})){
1424             die "Error in data -> median: unknown column: \"$column_head\" or invalid column number: \"$column\"\n";
1425           } else {
1426             $column = $self -> {'column_head_indices'}{$column_head};
1427           }
1428         }
1429
1430         if( defined $self -> {'median'}[$column] ){
1431           return $self -> {'median'}[$column];
1432         }
1433
1434         my @median_array;
1435
1436         foreach my $individual ( @{$self -> {'individuals'}} ) {
1437           if( $unique_in_individual ){
1438             my $ifactors = $individual -> factors( 'column' => $column );
1439
1440             foreach ( keys %{$ifactors} ) {
1441               next if ( $_ == $self -> {'missing_data_token'} );
1442               push( @median_array, $_ );
1443             }
1444           } else {
1445             my $ifactors = $individual -> subject_data;
1446
1447             for(my $i=0; $i<=$#{$ifactors}; $i++ ) {
1448               my @data_row = split( /,/ , $ifactors -> [$i] );
1449               next if ( $data_row[$column-1] == $self -> {'missing_data_token'} );
1450               push(@median_array, $data_row[$column-1]);
1451             }
1452           }
1453         }
1454         @median_array = sort {$a <=> $b} @median_array ;
1455
1456         if( @median_array % 2 ){
1457           $return_value = $median_array[$#median_array / 2];
1458         } else {
1459           $return_value = ( $median_array[@median_array / 2] +
1460                             $median_array[(@median_array - 2) / 2] ) / 2;
1461         }
1462
1463         $self -> {'median'}[$column] = $return_value;
1464       }
1465 end median
1466
1467 # }}} median
1468
1469 # {{{ mean
1470
1471 start mean
1472   {
1473      # Returns mean value of a column
1474      # If a individual contains more then 1 value (i.e. if an
1475      # individual has different values in different samples a mean
1476      # value of all individuals if calculate first, then the mean
1477      # value of the column If hi_cutoff is defined the mean function
1478      # will cut all value below the cutoff, and set their value to
1479      # 0. It's used to calculate the HI-mean/LOW-mean of a column for
1480      # e.g. Hockey-stick covariates If both hi_cutoff and low_cutoff
1481      # are defined only the hi_cutoff will be used.  See L</max>.
1482      my $tmp_column = $self -> {'column_head_indices'}{$column_head};
1483      $self -> synchronize;
1484      my $first_id = $self -> {'individuals'}[0];
1485        die "data -> mean: No individuals defined in data object based on ",
1486         $self -> full_name,"\n" unless defined $first_id;
1487
1488      my @data_row = split( /,/, $first_id -> subject_data ->[0] );
1489
1490      unless ( defined $column  && defined( $data_row[$column-1] ) ) {
1491        unless (defined($column_head) && defined($self -> {'column_head_indices'}{$column_head})) {
1492          die "Error in data -> mean: unknown column: \"$column_head\" or invalid column number: \"$column\"\n";
1493        } else {
1494               $column = $self -> {'column_head_indices'}{$column_head};
1495          }
1496      }
1497
1498         ## Here the calculation starts
1499         my $num_individuals = 0;
1500         my $sum = 0;
1501
1502      my $all_data_rows=0;
1503         foreach my $individual ( @{$self ->{'individuals'}} ) {
1504
1505           my $ifactors = $individual -> subject_data;
1506           my $individual_sum = 0;
1507           my $data_rows = 0;
1508           for(my $i=0; $i<=$#{$ifactors}; $i++ ) {
1509
1510             # data is stored in strings. We need to split them into an
1511             # array.
1512
1513             my @data_row = split( /,/, $ifactors -> [$i] );
1514             if ( $data_row[$column-1] == $self -> {'missing_data_token'} ) {
1515 #             print "Skipping row with missing data\n";
1516               next;
1517             }
1518
1519             if( defined $subset_column and not eval ( $data_row[$subset_column-1].$subset_syntax ) ) {
1520 #             print "Skipping row outside subset: syntax: ".($subset_column-1)." $subset_syntax\n";
1521               next;
1522             }
1523
1524             if (defined $hi_cutoff) {
1525               if ($data_row[$column-1]>$hi_cutoff) {
1526                 $individual_sum += $data_row[$column-1]-$hi_cutoff;
1527               }
1528             }
1529             else {
1530               if (defined $low_cutoff) {
1531                 if ($data_row[$column-1]<$low_cutoff) {
1532                   $individual_sum += $low_cutoff - $data_row[$column-1];
1533                 }
1534               }
1535               else {
1536                 $individual_sum += $data_row[$column-1];
1537               }
1538             }
1539             $data_rows++;
1540           }
1541           if( $global_mean ) {
1542             $sum += $individual_sum;
1543             $num_individuals += $data_rows;
1544           } else {
1545             if( $data_rows != 0 ) {
1546               $sum += $individual_sum/$data_rows;
1547             }
1548             $num_individuals ++;
1549           }
1550           $all_data_rows += $data_rows;
1551         }
1552      if( $num_individuals != 0 ) {
1553        $return_value = $sum / $num_individuals;
1554      }
1555 #     print "DR: $all_data_rows\n";
1556 #     print "\nNIM: $num_individuals $return_value\n";
1557 }
1558
1559 end mean
1560
1561 # }}} mean
1562
1563 # {{{ sd
1564
1565 start sd
1566   {
1567     # This sub returns standard deviation for a specific column
1568     # If there are more than one sample/individual the value used for that specific
1569     # individual is the mean value of its samples.
1570     # The cut-offs are for hockey stick variables. I.e. If one individual value is
1571     # lower than the hi-cutoff the individual value will be zero.
1572     # HI_cutoff is used to calculate the HI-mean of a column.
1573     # If cut_off is undef it won't be used
1574     # See L</max>.
1575      my $tmp_column = $self -> {'column_head_indices'}{$column_head};
1576      $self -> synchronize;
1577      my $first_id = $self -> {'individuals'}[0];
1578      debug -> die( message => "No individuals defined in data object based on ".
1579                    $self -> full_name ) unless defined $first_id;
1580
1581      my @data_row = split( /,/ , $first_id -> subject_data ->[0] );
1582
1583      unless ( defined $column  && defined( $data_row[$column-1] ) ) {
1584        unless (defined($column_head) && defined($self -> {'column_head_indices'}{$column_head})) {
1585          debug -> die( message => "Unknown column: \"$column_head\" or "
1586                        ."invalid column number: \"$column\"" );
1587        } else {
1588          $column = $self -> {'column_head_indices'}{$column_head};
1589        }
1590      }
1591
1592      ## Here the calculation starts
1593      my $num_individuals = 0;
1594      my $sum = 0;
1595      my $mean;
1596      if (defined $hi_cutoff) {
1597        $mean = $self->mean(column   => $column,
1598                            hi_cutoff => $hi_cutoff,
1599                            global_mean => $global_sd );
1600      } elsif (defined $low_cutoff) {
1601        $mean = $self->mean(column   => $column,
1602                            low_cutoff => $low_cutoff,
1603                            global_mean => $global_sd );
1604      } else {
1605        $mean = $self->mean( column        => $column,
1606                             subset_column => $subset_column,
1607                             subset_syntax => $subset_syntax,
1608                             global_mean => $global_sd );
1609      }
1610
1611      foreach my $individual ( @{$self -> {'individuals'}} ) {
1612        my $ifactors = $individual -> subject_data;
1613        my $individual_sum = 0;
1614        my $data_rows = 0;
1615        for(my $i=0; $i<=$#{$ifactors}; $i++ ) {
1616
1617          # data is stored in strings. We need to split them into an
1618          # array.
1619
1620          my @data_row = split( /,/, $ifactors -> [$i] );
1621
1622          if ( $data_row[$column-1] == $self -> {'missing_data_token'} ) {
1623 #          print "Skipping row with missing data\n";
1624            next;
1625          }
1626
1627          if( defined $subset_column and not eval ( $data_row[$subset_column-1].$subset_syntax ) ) {
1628 #          print "Skipping row outside subset: syntax: ".($subset_column-1)." $subset_syntax\n";
1629            next;
1630          }
1631
1632          if (defined $hi_cutoff) {
1633            if ($data_row[$column-1]>$hi_cutoff) {
1634              if( $global_sd ) {
1635                $individual_sum += ($data_row[$column-1] - $hi_cutoff - $mean) ** 2;
1636              } else {
1637                $individual_sum += $data_row[$column-1]-$hi_cutoff;
1638              }
1639            }
1640          } else {
1641            if (defined $low_cutoff) {
1642              if ($data_row[$column-1]<$low_cutoff) {
1643                if( $global_sd ) {
1644                  $individual_sum += ($low_cutoff - $data_row[$column-1] - $mean) ** 2;
1645                } else {
1646                  $individual_sum += $low_cutoff - $data_row[$column-1];
1647                }
1648              }
1649            } else {
1650              if( $global_sd ) {
1651                $individual_sum += ($data_row[$column-1] - $mean) ** 2;
1652              } else {
1653                $individual_sum += $data_row[$column-1];
1654              }
1655            }
1656          }
1657          $data_rows++;
1658        }
1659        if( $global_sd ) {
1660          $sum += $individual_sum;
1661          $num_individuals += $data_rows;
1662        } else {
1663          if( $data_rows != 0 ) {
1664            $sum += ($individual_sum/$data_rows - $mean) ** 2;
1665          }
1666          $num_individuals++;
1667        }
1668      }
1669      if( $num_individuals < 2 ) {
1670        $return_value = 0;
1671      } else {
1672        if( $num_individuals != 0 ) {
1673          $return_value = (1/($num_individuals-1)*$sum) ** 0.5;
1674        }
1675      }
1676    }
1677
1678 end sd
1679
1680 # }}} sd
1681
1682 # {{{ range
1683
1684 start range
1685       {
1686         # See L</max>.
1687         my $tmp_column = $self -> {'column_head_indices'}{$column_head};
1688         if ( defined $self -> {'range'}[$tmp_column] ) {
1689           $return_value = $self -> {'range'}[$tmp_column];
1690         } else {
1691           my $old_target = $self -> {'target'};
1692           $self -> {'target'} = 'mem';
1693           $self -> synchronize;
1694           $return_value = $self -> max( column      => $column,
1695                                         column_head => $column_head ) -
1696                                           $self -> min( column      => $column,
1697                                                         column_head => $column_head );
1698           $self -> {'range'}[$column] = $return_value;
1699           if ( $old_target eq 'disk' ) {
1700             $self -> flush if ( $self -> {'target'} eq 'disk' );
1701             $self -> {'target'} = 'disk';
1702           }
1703         }
1704       }
1705 end range
1706
1707 # }}} range
1708
1709 # {{{ recalc_column
1710
1711 start recalc_column
1712       {
1713           # Recalculates a column based on expression. Also, see L</max>.
1714         $self -> synchronize;
1715
1716         # Check if $column(-index) is defined and valid, else try to find index using column_head
1717         my $first_id = $self -> {'individuals'}[0];
1718         die "data -> recalc_column: No individuals defined in data object based on ",
1719           $self -> full_name,"\n" unless defined $first_id;
1720
1721         my @data_row = split( /,/ , $first_id -> subject_data ->[0] );
1722
1723         unless ( defined $column  && defined( $data_row[$column-1] ) ) {
1724           if(defined($column_head) && defined($self -> {'column_head_indices'}{$column_head})){
1725             die "Error in data -> recalc_column: unknown column: \"$column_head\" or column number: \"$column\"\n";
1726           } else {
1727             $column = $self -> {'column_head_indices'}{$column_head};
1728           }
1729         }
1730
1731         for my $individual ( @{$self -> {'individuals'}} ) {
1732           $individual -> recalc_column( column     => $column,
1733                                         expression => $expression );
1734         }
1735       }
1736 end recalc_column
1737
1738 # }}} recalc_column
1739
1740 # {{{ renumber_ascending
1741
1742 start renumber_ascending
1743       {
1744 # Renumbers the individuals (changes the subject identifiers) so that
1745 # all have unique integer numbers starting with start_at and
1746 # ascending. The primary use of this
1747 # method is not to order the individuals after their identifiers but to
1748 # ensure that all individuals have unique identifiers.
1749
1750         $self -> synchronize;
1751         foreach my $individual ( @{$self -> {'individuals'}} ) {
1752           $individual -> idnumber ( $start_at++ );
1753         }
1754         $self -> {'synced'} = 0;
1755       }
1756 end renumber_ascending
1757
1758 # }}} renumber_ascending
1759
1760 # {{{ renumber_descending
1761
1762 start renumber_descending
1763       {
1764           # See L</renumber_ascending>.
1765         $self -> synchronize;
1766         foreach my $individual ( @{$self -> {'individuals'}} ) {
1767           $individual -> idnumber ( $start_at-- );
1768         }
1769         $self -> {'synced'} = 0;
1770       }
1771 end renumber_descending
1772
1773 # }}} renumber_descending
1774
1775 # {{{ single_valued_data
1776
1777 start single_valued_data
1778       {
1779         # Usage:
1780         #
1781         # ($single_value_data_set, $remainder, $column_indexes) =
1782         #      $data_object -> single_valued_data( subset_name         => 'subset.dta',
1783         #                                          remainder_name      => 'remainder.dta',
1784         #                                          target              => 'disk',
1785         #                                          do_not_test_columns => [1..18,24,26];
1786         #
1787         # my $single_value_column_indexes = $column_indexes -> [0];
1788         # my $all_other_column_indexes    = $column_indexes -> [1];
1789         #
1790         # Analyses the content of each column, based on the
1791         # ID column, and returns two new data objects: One
1792         # that contains all columns that is has only one value per
1793         # individual and one that contains the
1794         # remainding data. This is useful for creating compact 'extra'
1795         # data sets that can be read in via user-defined sub-routines
1796         # when the number of columns needed exceeds the maximum that
1797         # NONMEM allows (e.g. 20 in NONMEM version V).
1798         #
1799         # The I<do_not_test_columns> argument specifies on which columns
1800         # to skip the single value test
1801
1802         my @multi_value_flags;
1803         my @individuals = @{$self -> {'individuals'}};
1804         # Initiate the flags:
1805         if ( defined $individuals[0] ) {
1806           my @data = @{$individuals[0] -> {'subject_data'}};
1807           my @data_row = split( /,/ , $data[0] );
1808           for ( my $i = 0; $i < scalar @data_row; $i++ ) {
1809             my $dnt_flag = 0;
1810             foreach my $dntc ( @do_not_test_columns ) {
1811               $dnt_flag = 1 if ( $i == $dntc - 1 );
1812             }
1813             $multi_value_flags[$i] = $dnt_flag;
1814           }
1815         } else {
1816           die "data -> single_valued_data: No data in ID number 1\n";
1817         }
1818         # Collect the stats
1819         for ( my $id = 0; $id <= $#individuals; $id++ ) {
1820           my @data = @{$individuals[$id] -> {'subject_data'}};
1821           my @data_row = split( /,/, $data[0] );
1822           for ( my $j = 0; $j < scalar @data_row; $j++ ) {
1823             my %col_unique;
1824             for ( my $i = 0; $i <= $#data; $i++ ) {
1825               my @data_row = split( /,/ , $data[$i] );
1826               $col_unique{$data_row[$j]}++;
1827             }
1828             my $factors = scalar keys %col_unique;
1829             $multi_value_flags[$j]++ if ( $factors > 1 );
1830           }
1831         }
1832         for ( my $i = 0; $i <= $#multi_value_flags; $i++ ) {
1833           if ( $multi_value_flags[$i] ) {
1834             push ( @{$column_indexes[1]}, $i + 1);
1835           } else {
1836             push ( @{$column_indexes[0]}, $i + 1);
1837           }
1838         }
1839         ( $single_value_data_set, $remainder ) =
1840           $self -> subset_vertically( column_indexes      => $column_indexes[0],
1841                                       subset_name         => $subset_name,
1842                                       return_remainder    => 1,
1843                                       remainder_name      => $remainder_name,
1844                                       target              => $target,
1845                                       keep_first_row_only => 1);
1846       }
1847 end single_valued_data
1848
1849 # }}}
1850
1851 # {{{ subset_vertically
1852
1853 start subset_vertically
1854       {
1855         # Usage:
1856         #
1857         # $subset = $data_object -> subset_vertically ( column_indexes => [1,2,6],
1858         #                                               subset_name    => 'subset.dta' );
1859         #
1860         # This basic usage returns a new data object containing
1861         # columns 1,2 and 6 from the original data plus the
1862         # idcolumn. The new data object will be associated with the
1863         # file 'subset.dta'.
1864         #
1865         # You get the remaining data, i.e. the original data minus
1866         # the created subset by specifying
1867         #
1868         # ( $subset, $remainder ) =
1869         #      $data_object -> subset_vertically ( column_indexes   => [1,2,6],
1870         #                                          subset_name      => 'subset.dta',
1871         #                                          return_remainder => 1,
1872         #                                          remainder_name   => 'remainder.dta' );
1873         #
1874         # If you would like to flush the created data sets to disk and
1875         # save memory, set the I<target> argument to 'disk'. The
1876         # default value 'mem' will keep the whole data object in RAM.
1877         #
1878         # The I<keep_first_row_only> argument can be used to reduce
1879         # the size of the subset data obejct by excluding all but the
1880         # first row of data from each individual.
1881         #
1882         my @individuals = @{$self -> {'individuals'}};
1883         # Create remainder index array if necessary
1884         my @remainder_indexes;
1885         if ( defined $individuals[0] ) {
1886           my @data = @{$individuals[0] -> {'subject_data'}};
1887           my $idcolumn = $individuals[0] -> {'idcolumn'};
1888 #         print "IC: $idcolumn\n";
1889           my $id_flag = 0;
1890           foreach my $use_index ( @column_indexes ) {
1891             $id_flag = 1 if ( $use_index == $idcolumn );
1892           }
1893           if ( $return_remainder ) {
1894 #           @remainder_indexes = ( $idcolumn );
1895             for ( my $i = 0; $i < scalar split(/,/,$data[0]); $i++ ) {
1896               my $rem_flag = 1;
1897               foreach my $use_index ( @column_indexes ) {
1898                 $rem_flag = 0 if ( $i == $use_index -1 );
1899 # or
1900 #                                  $i == $idcolumn -1 );
1901               }
1902               push( @remainder_indexes, $i + 1 ) if ( $rem_flag );
1903             }
1904             unshift( @remainder_indexes, $idcolumn ) if ( $id_flag );
1905           }
1906           unshift( @column_indexes, $idcolumn ) unless ( $id_flag );
1907         } else {
1908           die "data -> single_valued_data: No data in ID number 1\n";
1909         }
1910
1911         my @new_ids;
1912         my @new_ids_2;
1913         for ( my $id = 0; $id <= $#individuals; $id++ ) {
1914           my $idnumber = $individuals[$id] -> idnumber;
1915           my $idcolumn = $individuals[$id] -> idcolumn;
1916           my @data = @{$individuals[$id] -> {'subject_data'}};
1917           my @new_data;
1918           my @new_data_2;
1919           my $use_rows = $keep_first_row_only ? 0 : $#data;
1920           for ( my $i = 0; $i <= $use_rows; $i++ ) {
1921             my @new_row;
1922             my @data_row = split( /,/, $data[$i] );
1923             foreach my $use_index ( @column_indexes ) {
1924               push( @new_row, $data_row[$use_index-1] );
1925             }
1926 #           print "@new_row $#new_row\n";
1927             push( @new_data, join( ',', @new_row ) );
1928           }
1929           for ( my $i = 0; $i <= $#data; $i++ ) {
1930             if ( $return_remainder ) {
1931               my @new_row_2;
1932               my @data_row = split( /,/, $data[$i] );
1933               foreach my $use_index ( @remainder_indexes ) {
1934                 push( @new_row_2, $data_row[$use_index-1] );
1935               }
1936 #             print "@new_row_2 $#new_row_2\n";
1937               push( @new_data_2, join( ',' , @new_row_2 ) );
1938             }
1939           }
1940           my $new_id = data::individual -> new( idnumber     => $idnumber,
1941                                                 idcolumn     => $idcolumn,
1942                                                 subject_data => \@new_data );
1943           push( @new_ids, $new_id );
1944           if ( $return_remainder ) {
1945             my $new_id_2;
1946             $new_id_2 = data::individual -> new( idnumber     => $idnumber,
1947                                                  idcolumn     => $idcolumn,
1948                                                  subject_data => \@new_data_2 );
1949           push( @new_ids_2, $new_id_2 );
1950           }
1951         }
1952         my @header = @{$self -> {'header'}};
1953         my @new_header;
1954         foreach my $use_index ( @column_indexes ) {
1955           push( @new_header, @header[$use_index-1] );
1956         }
1957         my $comment;
1958         if( defined $self -> {'comment'} ){
1959           my @comment = @{$self -> {'comment'}};
1960           $comment = \@comment;
1961         }
1962         $subset = data -> new ( filename             => $subset_name,
1963                                 directory            => $self -> {'directory'},
1964                                 ignoresign           => $self -> {'ignoresign'},
1965                                 header               => \@new_header,
1966                                 comment              => $comment,
1967                                 individuals          => \@new_ids,
1968                                 target               => $target,
1969                                 ignore_missing_files => 1 );
1970         if ( $return_remainder ) {
1971           my @new_header_2;
1972           foreach my $use_index ( @remainder_indexes ) {
1973             push( @new_header_2, @header[$use_index-1] );
1974           }
1975           $remainder = data -> new ( filename             => $remainder_name,
1976                                      directory            => $self -> {'directory'},
1977                                      ignoresign           => $self -> {'ignoresign'},
1978                                      header               => \@new_header_2,
1979                                      comment              => $comment,
1980                                      individuals          => \@new_ids_2,
1981                                      target               => $target,
1982                                      ignore_missing_files => 1 );
1983         }
1984       }
1985 end subset_vertically
1986
1987 # }}}
1988
1989 # {{{ subsets
1990
1991 start subsets
1992       {
1993 #       if ( defined $expression and defined $bins ) {
1994 #         die "data -> subset: expression and bins may not both be specified\n";
1995 #       }
1996 #       if ( not ( defined $expression or defined $bins ) ) {
1997 #         die "data -> subset: expression or bins must be specified\n";
1998 #       }
1999         $self -> synchronize;
2000         my @header  = @{$self -> {'header'}};
2001         my @comment = defined $self -> {'comment'} ? @{$self -> {'comment'}} : ();
2002         my @subset_ids= ();
2003         my %rnd_ids;
2004         my $key = 0;
2005         my @ids = @{$self -> {'individuals'}};
2006         if ( defined $stratify_on ) {
2007           my $work_data = $self -> copy( filename => 'work_data.dta',
2008                                          target   => 'mem' );
2009           my %strata = %{$work_data -> factors( column => $stratify_on )};
2010 #         $Data::Dumper::Maxdepth = 1;
2011 #         print Dumper \%strata;
2012
2013           while ( my ( $factor, $keys ) = each %strata ) {
2014               foreach my $key ( @{$keys} ) {
2015                   my $rnd_num = rand;
2016                   while ( defined $rnd_ids{$factor}{$rnd_num} ) {
2017                       $rnd_num = rand;
2018                   }
2019                   $rnd_ids{$factor}{$rnd_num} = $ids[$key];
2020               }
2021           }
2022           my $first = 1;
2023           while ( my ( $factor, $rnd_nums ) = each %rnd_ids ) {
2024               my @sort_rnd_nums = sort { $a <=> $b } keys %{$rnd_nums};
2025               for ( my $i = 0; $i <= $#sort_rnd_nums; $i ) {
2026                   for ( my $j = 0; $j < $bins; $j++ ) {
2027                       if ( $first ) {
2028                           push( @subset_ids, [$rnd_ids{$factor}{$sort_rnd_nums[$i]} -> copy] );
2029                           push( @incl_ids, [$rnd_ids{$factor}{$sort_rnd_nums[$i]} -> idnumber] );
2030                       } else {
2031                           push( @{$subset_ids[$j]}, $rnd_ids{$factor}{$sort_rnd_nums[$i]} -> copy );
2032                           push( @{$incl_ids[$j]}, $rnd_ids{$factor}{$sort_rnd_nums[$i]} -> idnumber );
2033                       }
2034                       $i++;
2035                       last if $i > $#sort_rnd_nums;
2036                   }
2037                   $first = 0;
2038               }
2039           }
2040           for ( my $j = 0; $j < $bins; $j++ ) {
2041             my $sdata = data -> new ( header               => \@header,
2042                                       comment              => \@comment,
2043                                       ignoresign           => $self -> {'ignoresign'},
2044                                       individuals          => $subset_ids[$j],
2045                                       ignore_missing_files => 1,
2046                                       target               => 'disk',
2047                                       idcolumn             => $self -> {'idcolumn'},
2048                                       filename             => "subset_$j.dta" );
2049             #$sdata -> _write;
2050             push( @subsets, $sdata );
2051           }
2052         } else {
2053           for ( my $i = 0; $i <= $#ids; $i++ ) {
2054             my $rnd_num = rand;
2055             while ( defined $rnd_ids{$rnd_num} ) {
2056               $rnd_num = rand;
2057             }
2058             $rnd_ids{$rnd_num} = $ids[$i];
2059           }
2060           my @keys = sort { $a <=> $b } keys %rnd_ids;
2061           my $first = 1;
2062           for ( my $i = 0; $i <= $#keys; $i ) {
2063             for ( my $j = 0; $j < $bins; $j++ ) {
2064               if ( $first ) {
2065                 push( @subset_ids, [$rnd_ids{$keys[$i]} -> copy] );
2066                 push( @incl_ids, [$rnd_ids{$keys[$i]} -> idnumber] );
2067               } else {
2068                 push( @{$subset_ids[$j]}, $rnd_ids{$keys[$i]} -> copy );
2069                 push( @{$incl_ids[$j]}, $rnd_ids{$keys[$i]} -> idnumber );
2070               }
2071               $i++;
2072               last if $i > $#keys;
2073             }
2074             $first = 0;
2075           }
2076           for ( my $j = 0; $j < $bins; $j++ ) {
2077             my $sdata = data -> new ( header               => \@header,
2078                                       comment              => \@comment,
2079                                       ignoresign           => $self -> {'ignoresign'},
2080                                       individuals          => $subset_ids[$j],
2081                                       ignore_missing_files => 1,
2082                                       target               => $target,
2083                                       idcolumn             => $self -> {'idcolumn'},
2084                                       filename             => "subset_$j.dta" );
2085             #$sdata -> _write;
2086             push( @subsets, $sdata );
2087           }
2088         }
2089       }
2090 end subsets
2091
2092 # }}} subsets
2093
2094 # {{{ subset
2095
2096 start subset
2097       {
2098         $self -> synchronize;
2099         my @header  = @{$self -> {'header'}};
2100         my @comment = defined $self -> {'comment'} ? @{$self -> {'comment'}} : ();
2101         my @subset_inds = ();
2102         my $key = 0;
2103         foreach my $individual ( @{$self -> {'individuals'}} ) {
2104           if ( $individual -> evaluate_expression( column     => $based_on,
2105                                                    expression => $expression ) ) {
2106             push( @subset_inds, $individual -> copy );
2107             push( @incl_individuals, $individual -> idnumber );
2108             push( @included_keys, $key );
2109           }
2110           $key++;
2111         }
2112         $subset = data -> new ( header      => \@header,
2113                                 comment     => \@comment,
2114                                 ignoresign  => $self -> {'ignoresign'},
2115                                 individuals => \@subset_inds,
2116                                 idcolumn    => $self -> {'idcolumn'},
2117                                 filename    => "subset.dta" );
2118       }
2119 end subset
2120
2121 # }}} subset
2122
2123 # {{{ target
2124
2125 start target
2126       {
2127         if ( $parm eq 'disk' and $self -> {'target'} eq 'mem' ) {
2128           $self -> {'target'} = 'disk';
2129           $self -> flush;
2130         } elsif ( $parm eq 'mem' and $self -> {'target'} eq 'disk' ) {
2131           $self -> {'target'} = 'mem';
2132           $self -> synchronize;
2133         }
2134       }
2135 end target
2136
2137 # }}}
2138
2139 # {{{ _write
2140
2141 start _write
2142     {
2143       die "ERROR: data -> _write: No filename set in data object.\n"
2144           if( $filename eq '' );
2145
2146 #      $Data::Dumper::Maxdepth = 2;
2147 #      die Dumper $self -> {'individuals'};
2148
2149       if( not defined $self -> {'individuals'} ){
2150
2151         # If we don't have any individuals and write to a new
2152         # filename, we must first read individuals from the old
2153         # file. A call to synchronize will do that. There is no risk
2154         # of a infinite loop here since synchronize allways writes to
2155         # "full_name".
2156
2157         unless( $filename eq $self -> full_name ){
2158           $self -> synchronize;
2159         }
2160       }
2161
2162       open(FILE,">$filename") ||
2163           die "Could not create $filename\n";
2164       my $data_ref = $self -> format_data;
2165       my @data = @{$data_ref};
2166       for ( @data ) {
2167         print ( FILE );
2168       }
2169       close(FILE);
2170
2171 #       if ( $PsN::config -> {'_'} -> {'use_database'} and
2172 #          $self -> {'use_data_table'} ) {
2173 #       # Backslashes messes up the sql syntax
2174 #       my $file_str = $self->{'filename'};
2175 #       my $dir_str = $self->{'directory'};
2176 #       $file_str =~ s/\\/\//g;
2177 #       $dir_str =~ s/\\/\//g;
2178
2179 #       # md5sum
2180 #       my $md5sum = md5_hex(OSspecific::slurp_file($self-> full_name ));
2181 #       my ( $date_str, $time_str );
2182 #       if ( $Config{osname} eq 'MSWin32' ) {
2183 #         $date_str = `date /T`;
2184 #         $time_str = ' '.`time /T`;
2185 #       } else {
2186 #         # Assuming UNIX
2187 #         $date_str = `date`;
2188 #       }
2189 #       chomp($date_str);
2190 #       chomp($time_str);
2191 #       my $date_time = $date_str.$time_str;
2192 #       my $dbh = DBI -> connect("DBI:mysql:host=".$PsN::config -> {'_'} -> {'database_server'}.
2193 #                              ";databse=".$PsN::config -> {'_'} -> {'project'},
2194 #                                $PsN::config -> {'_'} -> {'user'},
2195 #                               $PsN::config -> {'_'} -> {'password'},
2196 #                                {
2197 #                                 'RaiseError' => 1});
2198 #       my $sth;
2199 #       if ( defined $self -> {'data_id'} ) {
2200 #         $sth = $dbh -> prepare( "UPDATE ".$PsN::config -> {'_'} -> {'project'}.
2201 #                              ".data ".
2202 #                                 "SET filename='$file_str',date='$date_time',".
2203 #                                 "directory='$dir_str',md5sum='$md5sum' ".
2204 #                                 "WHERE data_id='".$self -> {'data_id'}."'" );
2205 #         $sth -> execute or debug -> die( message => $sth->errstr ) ;
2206 #       } else {
2207 #         $sth = $dbh -> prepare("INSERT INTO ".$PsN::config -> {'_'} -> {'project'}.
2208 #                              ".data (filename,date,directory,md5sum) ".
2209 #                                "VALUES ('$file_str', '$date_time', '$dir_str','".
2210 #                                $md5sum."' )");
2211 #         $sth -> execute;
2212 #         $self -> {'data_id'} = $sth->{'mysql_insertid'};
2213 #       }
2214 #       $sth -> finish;
2215 #       $dbh -> disconnect;
2216 #       }
2217     }
2218 end _write
2219
2220 # }}} _write
2221
2222 # {{{ flush
2223
2224 start flush
2225     {
2226       # synchronizes the object with the file on disk and empties
2227       # most of the objects attributes to save memory.
2228       if( defined $self -> {'individuals'} and
2229           ( !$self -> {'synced'} or $force ) ) {
2230         $self -> _write;
2231       }
2232 #      $self -> {'header'} = undef;
2233       $self -> {'comment'} = undef;
2234       $self -> {'individuals'} = undef;
2235       $self -> {'synced'} = 0;
2236 #      $self -> {'column_head_indices'} = undef; # kajsa comment 080211
2237       $self -> {'have_missing_data'} = undef;
2238     }
2239 end flush
2240
2241 # }}} flush
2242
2243 # {{{ synchronize
2244
2245 start synchronize
2246     {
2247       # synchronizes the object with the file on disk
2248       unless( $self -> {'synced'} ){
2249         if( defined $self -> {'individuals'} and
2250             scalar @{$self -> {'individuals'}} > 0 ){
2251           # We should not read new data from file if we
2252           # have an individuals defined?
2253           # Perhaps there should be an attribute
2254           # 'from_file' that overrides this and reads in
2255           # the data from the file specified in filename
2256           # and overwrites whatever the object already
2257           # contains?
2258 #           if( -e $self -> {'filename'} ){
2259 #             $self -> _read_header;
2260 #             $self -> _read_individuals;
2261 #           }
2262           $self -> _write;
2263         } else {
2264           if( -e $self -> full_name ){
2265             unless( defined $self -> {'header'} and scalar @{$self -> {'header'}} > 0 ){
2266               $self -> _read_header;
2267             }
2268             $self -> _read_individuals;
2269           } else {
2270             debug -> die( message => "Fatal error: datafile: " . $self -> full_name . " does not exist." );
2271             return;
2272           }
2273         }
2274       }
2275       my $i = 1;
2276       foreach my $head ( @{$self -> {'header'}} ){
2277         $self -> {'column_head_indices'} -> {$head} = $i;
2278         $i++;
2279       }
2280       $self -> {'synced'} = 1;
2281     }
2282 end synchronize
2283
2284 # }}} synchronize
2285
2286 # {{{ _fisher_yates_shuffle
2287
2288 start _fisher_yates_shuffle
2289       {
2290         my $arr_ref = $parm{'array'};
2291         debug -> warn( level   => 1,
2292                        message => "Array of zero length received" )
2293           if ( scalar @{$arr_ref} < 1 );
2294         my $i;
2295         for ($i = @$arr_ref; --$i; ) {
2296           my $j = random_uniform_integer(1,0,$i);
2297 #          my $j = int rand ($i+1);
2298 #         print "$j $j_new\n";
2299           @$arr_ref[$i,$j] = @$arr_ref[$j,$i];
2300         }
2301       }
2302 end _fisher_yates_shuffle
2303
2304 # }}} _fisher_yates_shuffle
2305
2306 # {{{ _read_header
2307
2308 start _read_header
2309       {
2310         my $filename   = $self -> full_name;
2311         my $ignoresign = $self -> ignoresign;
2312         my ( @data, @new_record, $row, $tmp_row, @header, $hdrstring );
2313
2314         open(DATAFILE,"$filename") ||
2315           die "Could not open $filename for reading";
2316         my $columns;
2317         while (<DATAFILE>) {
2318           s/\s*\,\s*/\,/g;
2319           $tmp_row    = $_;
2320           #     @new_record = split(/\,|\s+/,$_);
2321           if ( ! (/^\s*\d+|^\s*\./) ) {
2322             $data[$row] = $tmp_row;
2323             $row++;
2324           } else {
2325             # We have reached the first data-row, return.
2326             $columns = scalar split(/\,\s*|\s+/);
2327             last;
2328           }
2329         }
2330         close(DATAFILE);
2331
2332         if ( defined $self -> {'cont_column'} and not $self -> {'table_file'} ) {
2333           my $data_len = $#data;
2334           for ( my $i = $data_len; $i >= 0; $i-- ) {
2335             my @arr = split(/\,\s*|\s+/,$data[$i]);
2336             if ( $arr[$self -> {'cont_column'}-1] eq 'CONT' ) {
2337               my $start = $i == $data_len ? 0 : 1;
2338               for ( my $j = $start; $j <= $#arr; $j++ ) {
2339                 if ( $j != ($self -> {'cont_column'}-1) ) {
2340                   push( @header, $arr[$j] );
2341                 }
2342               }
2343               pop( @data );
2344             }
2345           }
2346           # the \Q and \E here are to escape wierd ignoresigns
2347           $header[0] =~ s/\Q$ignoresign\E//
2348               if ( defined $self->ignoresign );
2349           shift( @header ) if ( $header[0] eq "" );
2350         } else {
2351           chomp( $hdrstring = pop(@data));
2352           @header = split(/\,\s*|\s+/,$hdrstring);
2353           # the \Q and \E here are to escape wierd ignoresigns
2354           $header[0] =~ s/\Q$ignoresign\E//
2355               if ( defined $self->ignoresign );
2356           shift( @header ) if ( $header[0] eq "" );
2357           if( $self -> {'table_file'} ) {
2358             my @new_header;
2359             for( my $i = 1; $i <= scalar @header; $i++ ) {
2360               if( $header[$i-1] eq 'CONT' ) {
2361                 if ( defined $self -> {'cont_column'} and not $i == $self -> {'cont_column'} ) {
2362                   debug -> warn( level   => 1,
2363                                  message => "The supplied columns for the CONT data item (".
2364                                  $self -> {'cont_column'}.") does not match the column where the CONT ".
2365                                  "header was found ($i), using $i" );
2366                 }
2367                 $self -> {'cont_column'} = $i;
2368               } else {
2369                 push( @new_header, $header[$i-1] );
2370               }
2371             }
2372             @header = @new_header;
2373             for( my $i = 1; $i <= scalar @header; $i++ ) {
2374               if( $header[$i-1] eq 'ID' ) {
2375                 if ( defined $self -> {'idcolumn'} and not $i == $self -> {'idcolumn'} ) {
2376                   debug -> warn( level   => 1,
2377                                  message => "The supplied columns for the ID data item (".
2378                                  $self -> {'idcolumn'}.") does not match the column where the CONT ".
2379                                  "header was found ($i), using $i" );
2380                 }
2381                 $self -> {'idcolumn'} = $i;
2382               }
2383             }
2384           }
2385         }
2386
2387 # I'm not certain on how to deal with this conflict. I'm leaving it commented because I believe this code should not be here.
2388 #
2389 #<<<<<<< data_subs.pm
2390 #       $header[0] =~ s/$ignoresign//
2391 #         if ( defined $self->ignoresign );
2392 #       shift( @header ) if ( $header[0] eq "" );
2393 #=======
2394 #>>>>>>> 1.28
2395
2396 # It is ok with data sets without a header.
2397 #       unless( scalar @header > 0 ){ debug -> die( message => 'Datafile ' . $self -> full_name . ' is empty.' ); }
2398
2399         $self -> {'header'} = \@header;
2400         $self -> {'comment'} = \@data;
2401 #       if ( $PsN::config -> {'_'} -> {'use_database'} and
2402 #            $self -> {'use_data_table'} ) {
2403 #         my $dbh = DBI -> connect("DBI:mysql:host=".$PsN::config -> {'_'} -> {'database_server'}.
2404 #                              ";databse=".$PsN::config -> {'_'} -> {'project'},
2405 #                                  $PsN::config -> {'_'} -> {'user'},
2406 #                               $PsN::config -> {'_'} -> {'password'},
2407 #                                  {'RaiseError' => 1});
2408 #         if ( scalar @header < 1 ) {
2409 #           for ( my $i = 1; $i <= $columns; $i++ ) {
2410 #             push( @header, $i );
2411 #           }
2412 #         }
2413 #         for ( my $i = 0; $i <= $#header; $i++ ) {
2414 #           my $sth = $dbh -> prepare("INSERT INTO ".$PsN::config -> {'_'} -> {'project'}.
2415 #                              ".data_column ".
2416 #                                     "(name,number,data_id) ".
2417 #                                     "VALUES ('".$header[$i]."', '".($i+1).
2418 #                                     "', '".$self -> {'data_id'}."' )");
2419 #           $sth -> execute;
2420 #           push( @{$self -> {'data_column_ids'}}, $sth->{'mysql_insertid'} );
2421 #           $sth -> finish;
2422 #         }
2423 #         $dbh -> disconnect;
2424 #       }
2425       }
2426 end _read_header
2427
2428 # }}} _read_header
2429
2430 # {{{ _read_individuals
2431
2432 start _read_individuals
2433       {
2434         my $idcol        = $self -> idcolumn;
2435         my $filename     = $self -> full_name;
2436         #debug -> warn( level   => 1,
2437         #              message => "Building array of individuals from file " . $self -> {'filename'} );
2438         open(DATAFILE,"$filename") ||
2439           die "Could not open $filename for reading";
2440         my ( @new_row, $new_ID, $old_ID, @init_data );
2441         my $buffer;
2442         my $lines = 0;
2443         while (sysread DATAFILE, $buffer, 4096) {
2444           $lines += ($buffer =~ tr/\n//);
2445         }
2446         seek( DATAFILE, 0,0 );
2447
2448         # For status bar:
2449         my $status_bar = status_bar -> new( steps => $lines );
2450
2451         ui -> print( category => 'scm',
2452                      message  => "Reading data file: ".$self -> filename );
2453         ui -> print( category => 'scm',
2454                      message => $status_bar -> print_step(),
2455                      newline => 0);
2456
2457         my ( $sth, $dbh, $first_row_id, $first_value_id );
2458         my $insert = 1;
2459
2460         my $insert_rows;
2461         my $insert_values;
2462         my $row_counter = 0;
2463         my $individual_counter = 0;
2464         my $full_row;
2465       ROW: while ( <DATAFILE> ) {
2466           s/^ *//;
2467           s/\s*\,\s*/\,/g;
2468           my @new_row   = split(/\,\s*|\s+/);
2469           # This regexp check is not time consuming.
2470           if ( /^\s*\d+|^\s*\./ ) {
2471             if ( defined $self -> {'cont_column'} ) {
2472               if ( $new_row[$self -> {'cont_column'} - 1] == 1 ) {
2473                 if ( not $self -> {'table_file'} ) { # Skip the CONT=1 rows if this is a table file
2474                   for ( my $i = $#new_row; $i > 0; $i-- ) {
2475                     if ( $i != ($self -> {'cont_column'} - 1) ) {
2476                       unshift( @{$full_row}, $new_row[$i] );
2477                     }
2478                   }
2479                 }
2480                 next ROW;
2481               } else {
2482                 for ( my $i = $#new_row; $i >= 0; $i-- ) {
2483 #                 if ( $i != ($self -> {'cont_column'} - 1) or $self -> {'table_file'} ) {
2484                   if ( $i != ($self -> {'cont_column'} - 1) ) {
2485                     unshift( @{$full_row}, $new_row[$i] );
2486                   }
2487                 }
2488               }
2489             } else {
2490               @{$full_row} = @new_row;
2491             }
2492             $new_ID = $full_row -> [$idcol-1]; # index starts at 0
2493             $old_ID = $new_ID if ( not defined $old_ID );
2494
2495             #If we have not yet found first individual to read, then
2496             #count each time new individual found. If new individual is
2497             #first individual to read, then reset old_ID so that
2498             #the individual's lines will be read before new individual is pushed.
2499             #If we have not found first individual to read then skip to next line in file
2500             if ($skip_individuals > $individual_counter) {
2501                 #can only enter here if skip_individuals > 0 and have not started reading
2502                 if ( $new_ID != $old_ID ) {
2503                     $individual_counter++;
2504                 }
2505                 $old_ID = $new_ID;
2506                 unless ($skip_individuals == $individual_counter) {
2507                     #discard what has been read and skip to next line
2508                     $full_row = undef;
2509                     next ROW;
2510                 }
2511             }
2512
2513
2514             # Check if column miss data at some row (This adds about 30% of init time)
2515             my $mdt = $self -> {'missing_data_token'};
2516             for( my $i = 0; $i <= $#{$full_row}; $i++ ){
2517                 $self -> {'have_missing_data'} -> {$i+1} = 1
2518                   if( $full_row -> [$i] == $mdt ); # == is slower but safer than eq
2519             }
2520
2521             if ( $new_ID != $old_ID ) {
2522               my @subject_data = @init_data;
2523               my $id = data::individual -> new ( idcolumn     => $idcol,
2524                                                  subject_data => \@subject_data,
2525                                                  data_id      => $self -> {'data_id'} );
2526               push( @{$self -> {'individuals'}}, $id );
2527               #check if have stored max number individuals
2528               if ($max_individuals > 0) {
2529                   if (scalar(@{$self -> {'individuals'}}) == $max_individuals){
2530                       @init_data =(); #prevent any more rows from being stored after loop
2531                       last ROW;
2532                   }
2533               }
2534
2535               @init_data =(join( ",", @{$full_row}));
2536             } else {
2537               push( @init_data, join( ",", @{$full_row}) );
2538             }
2539             $old_ID = $new_ID;
2540             $full_row = undef;
2541           }
2542           if ( $status_bar -> tick() ) {
2543             ui -> print( category => 'scm',
2544                          message  => $status_bar -> print_step(),
2545                          wrap     => 0,
2546                          newline  => 0 );
2547           }
2548         }
2549
2550         # if we have ended loop because of max number individuals, init_data will be empty.
2551         if ( $#init_data >= 0 ) {
2552           push( @{$self -> {'individuals'}},
2553                 data::individual -> new ( idcolumn     => $idcol,
2554                                           subject_data => \@init_data ) );
2555         }
2556         ui -> print( category => 'scm',
2557                      message  => " ... done" );
2558         close(DATAFILE);
2559 #       $self -> _write( filename => 'test.dta' );
2560       }
2561 end _read_individuals
2562
2563 # }}} _read_individuals
2564
2565 # {{{ create_row_filter
2566
2567 start create_row_filter
2568 {
2569     unless ($self -> {'synced'}){
2570         'debug' -> die(message => "Cannot use this function with non-synced data.");
2571     }
2572
2573     my %index_hash=();
2574     my $index=0;
2575     my $keep;
2576     my $ind_counter=0;
2577
2578     my $count=scalar(@{$self -> {'individuals'}});
2579     if ($no_individuals < 1){
2580         'debug' -> die(message =>"Requested filter length must be at least 1 individual.");
2581     }
2582     if ($no_individuals > $count){
2583         'debug' -> die(message =>"Requested filter length $no_individuals individuals is larger ".
2584                        "than number of individuals in loaded dataset $count.");
2585     }
2586
2587
2588 #  if ( not $column =~ /^\d/ ) {
2589 #    $column = $self -> {'column_head_indices'} -> {$column} - 1;
2590 #  }
2591
2592     foreach my $column_name (@{$self -> {'header'}}){
2593         #could add if-statement here to allow different filter types.
2594         if ($column_name =~ /^(MDV)$/){
2595             if (exists $index_hash{$1}){
2596                 'debug' -> die(message =>"Found column header $1 twice, bailing out.");
2597             }
2598             $index_hash{$1}=$index;
2599         }
2600         $index++;
2601     }
2602
2603     #if no columns to filter on were found, return empty filter array
2604     #which means "keep everything". Otherwise enter loop below.
2605
2606     unless (scalar(keys %index_hash) < 1){
2607         foreach my $individual ( @{$self -> {'individuals'}} ) {
2608             foreach my $datarow ( @{$individual -> {'subject_data'}}){
2609                 $keep = 1;
2610                 my @row = split( /,/ , $datarow );
2611                 foreach my $key (keys %index_hash){
2612                     if ($row[$index_hash{$key}] != 0){
2613                         $keep = 0;
2614                         last;
2615                     }
2616                 }
2617                 push (@filter,$keep);
2618             }
2619             $ind_counter++;
2620             if ($ind_counter == $no_individuals){
2621                 last;
2622             }
2623         }
2624     }
2625
2626 }
2627 end create_row_filter
2628
2629 # }}} create_row_filter
2630
2631 # {{{ append_columns_to_matrix
2632
2633 start append_columns_to_matrix
2634 {
2635     my $row_count = scalar(@{$matrix}); #avreferera
2636     my ($column_count,$offset);
2637
2638     if ($row_count < 1){
2639         'debug' -> die(message => "Matrix cannot be empty in append_columns_to matrix.");
2640     }
2641     if (scalar(@columns)<1){
2642         'debug' -> die(message => "Column array cannot be empty in append_columns_to matrix.");
2643     }
2644     if (scalar(@columns)%$row_count > 0){
2645         'debug' -> die(message => "Column element count must be multiple of row count in matrix.");
2646     }
2647
2648     $column_count = scalar(@columns)/$row_count;
2649
2650     for (my $i=0; $i<$column_count; $i++){
2651         $offset=$i*$row_count;
2652         for (my $j=0; $j<$row_count; $j++){
2653             ${$matrix}[$j] .= ','. $columns[$offset+$j];
2654         }
2655     }
2656 }
2657 end append_columns_to_matrix
2658
2659 # }}} append_columns_to_matrix
2660
2661