moved nonpb.pm
[PsN.git] / lib / data_subs.pm
blob4329376243607f3e5227c92b9b86544cc85dc6d4
1 This file contains the code for the subroutines in
2 Perl-speaks-NONMEMs data module. It is not functional by itself.
3 The code should be transferred to the module autogenerated by dia2code
4 using the fill_diacode.pl script.
6 # {{{ include
8 start include statements
9 use Digest::MD5 'md5_hex';
10 use OSspecific;
11 use File::Copy "cp";
12 use Carp;
13 use Carp qw(cluck);
14 use Config;
15 use Math::Random;
16 use Storable;
17 use debug;
18 use ui;
19 use status_bar;
20 use Data::Dumper;
21 use Time::HiRes qw(gettimeofday);
22 my @primary_column_names = ('ID', 'DATE', 'DAT1', 'DAT2', 'DAT3' ,'L1', 'L2', 'DV', 'MDV', 'RAW_', 'MRG_', 'RPT_', 'TIME', 'DROP', 'SKIP', 'EVID', 'AMT', 'RATE', 'SS', 'II', 'ADDL', 'CMT', 'PCMT', 'CALL');
23 end include
25 # }}} include statements
27 # {{{ description
29 start description
30 # The structure of the data class is subject-centric, recognising that
31 # the subjects included in a study often can be regarded as
32 # independent. A class for the subject level exists within PsN and is
33 # called the individual class. A data object consists of at least one
34 # but probably many individual objects plus optional comments.
35 end description
37 # }}} description
39 # {{{ synopsis
41 start synopsis
42 # use data;
44 # my $data_obj = data -> new ( filename => 'test040314.dta' );
46 # $data_obj -> renumber_ascending;
48 # my $subsets_ref = $data_obj -> case_deletion( bins => 10 );
50 # my @subsets = @{$subsets_ref};
51 end synopsis
53 # }}} synopsis
55 # {{{ see_also
57 start see_also
58 # =begin html
60 # <a HREF="model.html">model</a>, <a HREF="output.html">output</a>,
61 # <a HREF="tool/modelfit.html">tool::modelfit</a>,
62 # <a HREF="tool.html">tool</a>
64 # =end html
66 # =begin man
68 # model, output, tool::modelfit, tool
70 # =end man
71 end see_also
73 # }}} see_also
75 # {{{ new
77 start new
79 # If the column holding the subject identifier is not the
80 # first, it can be specified using the I<idcolumn> attribute
82 # I<ignoresign> determines which rows that are regarded as
83 # comments. Corresponds to the IGNORE= option in the $DATA
84 # record in a NONMEM model file.
86 $this -> {'use_data_table'} = 0;
88 ( $this -> {'directory'},
89 $this -> {'filename'} ) = OSspecific::absolute_path( $this -> {'directory'},
90 $this->{'filename'} );
92 debug -> warn( level => 2,
93 message => "data -> new: Data object initialized from file: ".
94 $this -> full_name );
96 # sub register_in_database {
97 # my $this = shift;
98 # # Backslashes messes up the sql syntax
99 # my $file_str = $this->{'filename'};
100 # my $dir_str = $this->{'directory'};
101 # $file_str =~ s/\\/\//g;
102 # $dir_str =~ s/\\/\//g;
104 # # md5sum
105 # my $md5sum = md5_hex(OSspecific::slurp_file($this-> full_name ));
106 # my $dbh = DBI ->
107 # connect("DBI:mysql:host=".$PsN::config -> {'_'} -> {'database_server'}.
108 # ";databse=".$PsN::config -> {'_'} -> {'project'},
109 # $PsN::config -> {'_'} -> {'user'},
110 # $PsN::config -> {'_'} -> {'password'},
111 # {'RaiseError' => 1});
112 # my $sth;
113 # my $sth = $dbh -> prepare( "SELECT data_id FROM ".$PsN::config -> {'_'} -> {'project'}.
114 # ".data ".
115 # "WHERE filename = '$file_str' AND ".
116 # "directory = '$dir_str' AND ".
117 # "md5sum = '".$md5sum."'" );
118 # $sth -> execute or debug -> die( message => $sth->errstr ) ;
119 # my $select_arr = $sth -> fetchall_arrayref;
120 # if ( scalar @{$select_arr} > 0 ) {
121 # debug -> warn( level => 1,
122 # message => "Found an old entry in the database matching the ".
123 # "current data file" );
124 # if ( scalar @{$select_arr} > 1 ) {
125 # debug -> warn( level => 1,
126 # message => "Found more than one matching entry in database".
127 # ", using the first" );
129 # $this -> {'data_id'} = $select_arr->[0][0];
130 # } else {
131 # my ( $date_str, $time_str );
132 # if ( $Config{osname} eq 'MSWin32' ) {
133 # $date_str = `date /T`;
134 # $time_str = ' '.`time /T`;
135 # } else {
136 # # Assuming UNIX
137 # $date_str = `date`;
139 # chomp($date_str);
140 # chomp($time_str);
141 # my $date_time = $date_str.$time_str;
142 # $sth = $dbh -> prepare("INSERT INTO ".$PsN::config -> {'_'} -> {'project'}.
143 # ".data (filename,date,directory,md5sum) ".
144 # "VALUES ('$file_str', '$date_time', '$dir_str','".
145 # $md5sum."' )");
146 # $sth -> execute;
147 # $this -> {'data_id'} = $sth->{'mysql_insertid'};
149 # $sth -> finish;
150 # $dbh -> disconnect;
154 unless ( ( defined $this -> {'header'} and
155 scalar @{$this -> {'header'}} > 0 ) or
156 ( defined $this -> {'individuals'} and
157 scalar @{$this -> {'individuals'}} > 0 ) ) {
158 if ( -e $this -> full_name ) {
159 if ( $this -> {'target'} eq 'mem' ) {
160 # &register_in_database( $this ) if ( $PsN::config -> {'_'} -> {'use_database'} and
161 # $this -> {'use_data_table'} );
162 $this -> _read_header;
163 $this -> _read_individuals;
164 $this -> {'synced'} = 1;
165 } else {
166 $this -> {'synced'} = 0;
168 } else {
169 debug -> die(message => "No header, individuals, and no file " . $this -> full_name . " on disk.")
170 unless $this -> {'ignore_missing_files'};
171 $this -> {'synced'} = 0;
173 } else {
174 if ( $this -> {'target'} eq 'mem') {
175 if ( -e $this -> {'filename'} ) {
176 $this -> _read_header;
177 # &register_in_database if ( $PsN::config -> {'_'} -> {'use_database'} and
178 # $this -> {'use_data_table'} );
179 $this -> _read_individuals;
180 $this -> {'synced'} = 1;
181 } else {
182 debug -> die(message => "No file:".$this->{'filename'}." on disk" )
183 unless $this -> {'ignore_missing_files'};
184 $this -> {'synced'} = 0;
186 } else {
187 $this -> flush;
191 if ( $this -> {'synced'} ) {
192 my $i = 1;
193 foreach my $head ( @{$this -> {'header'}} ) {
194 $this -> {'column_head_indices'} -> {$head} = $i;
195 $i++;
198 # $Data::Dumper::Maxdepth = 3;
199 # die Dumper $this -> {'individuals'};
201 end new
203 # }}} new
205 # {{{ register_in_database
206 start register_in_database
207 if ( $PsN::config -> {'_'} -> {'use_database'} ) {
208 # Backslashes messes up the sql syntax
209 my $file_str = $self->{'filename'};
210 my $dir_str = $self->{'directory'};
211 $file_str =~ s/\\/\//g;
212 $dir_str =~ s/\\/\//g;
214 my $project = $PsN::config -> {'_'} -> {'project'};
215 # md5sum
216 my $md5sum = md5_hex(OSspecific::slurp_file($self-> full_name ));
218 my $dbh = DBI -> connect("DBI:mysql:host=".$PsN::config -> {'_'} -> {'database_server'}.
219 ";databse=".$project,
220 $PsN::config -> {'_'} -> {'user'},
221 $PsN::config -> {'_'} -> {'password'},
222 {'RaiseError' => 1});
224 my $sth;
226 my $select_arr = [];
228 if ( not $force ) {
229 my $sth = $dbh -> prepare( "SELECT data_id FROM ".$project.
230 ".data ".
231 "WHERE filename = '$file_str' AND ".
232 "directory = '$dir_str' AND ".
233 "md5sum = '".$md5sum."'" );
234 $sth -> execute or debug -> die( message => $sth->errstr ) ;
235 $select_arr = $sth -> fetchall_arrayref;
238 if ( scalar @{$select_arr} > 0 ) {
239 'debug' -> warn( level => 1,
240 message => "Found an old entry in the database matching the ".
241 "current data file" );
242 if ( scalar @{$select_arr} > 1 ) {
243 'debug' -> warn( level => 1,
244 message => "Found more than one data matching entry in database".
245 ", using the first" );
247 $self -> {'data_id'} = $select_arr->[0][0];
248 # Find the id's
249 my $sth = $dbh -> prepare( "SELECT individual_id FROM ".$project.".data_individual ".
250 "WHERE data_id = '".$self -> {'data_id'}."'" );
251 $sth -> execute or debug -> die( message => $sth->errstr ) ;
252 my $id_arr = $sth -> fetchall_arrayref;
253 map( $_ = $_ -> [0], @{$id_arr} );
254 $self -> {'individual_ids'} = $id_arr;
255 } elsif ( defined $self -> {'individuals'} ) {
256 my ( $date_str, $time_str );
257 if( $Config{osname} eq 'MSWin32' ){
258 $date_str = `date /T`;
259 $time_str = ' '.`time /T`;
260 } else {
261 # Assuming UNIX
262 $date_str = `date`;
264 chomp($date_str);
265 chomp($time_str);
266 my $date_time = $date_str.$time_str;
267 my ( $columns, $values );
268 my $res_str = $resampled ? '1' : '0';
269 if ( defined $model_id ) {
270 $columns = '(model_id, filename, date, directory, md5sum, resampled)';
271 $values = "('$model_id', '$file_str', '$date_time', '$dir_str','".
272 $md5sum."', '$res_str' )";
273 } else {
274 $columns = '(filename, date, directory, md5sum, resampled)';
275 $values = "('$file_str', '$date_time', '$dir_str','".$md5sum."', '$res_str' )";
277 $sth = $dbh -> prepare("INSERT INTO ".$PsN::config -> {'_'} -> {'project'}.
278 ".data $columns VALUES $values");
279 $sth -> execute;
280 $self -> {'data_id'} = $sth->{'mysql_insertid'};
282 if ( defined $self -> {'data_id'} ) {
283 my $values;
284 my $columns = "( id_key, id )";
285 if( $#individual_ids >= 0 ) {
286 $self -> register_di_relation( individual_ids => \@individual_ids );
287 } else {
288 my $inds = scalar @{$self -> {'individuals'}};
289 $dbh -> do( "LOCK TABLES ".$PsN::config -> {'_'} -> {'project'}.
290 ".individual WRITE" );
291 # $sth = $dbh -> prepare( "SELECT MAX(individual_id)".
292 # " FROM ".$PsN::config -> {'_'} -> {'project'}.
293 # ".individual" );
294 $dbh -> do( 'USE '.$PsN::config -> {'_'} -> {'project'} );
295 $sth = $dbh -> prepare( "SHOW TABLE STATUS LIKE 'individual'" );
296 $sth -> execute or debug -> die( message => $sth->errstr ) ;
297 my $select_arr = $sth -> fetchall_arrayref;
298 my $first_id_id = $select_arr -> [0][10] ?
299 $select_arr -> [0][10] : 0;
300 # my $first_id_id = $select_arr -> [0][0] ? ($select_arr -> [0][0] + 1) : 0;
301 my $last_id_id = $first_id_id + $inds - 1;
302 for( my $i = 0; $i < $inds; $i++ ) {
303 if( defined $self -> {'individuals'}[$i] ) {
304 my $id_id = $self -> {'individuals'}[$i] -> idnumber;
305 $values = $values."," if ( defined $values );
306 $values = $values."( $i, $id_id )";
309 $sth = $dbh -> prepare( "INSERT INTO ".$PsN::config -> {'_'} -> {'project'}.
310 ".individual $columns VALUES $values" );
311 $sth -> execute;
312 $dbh -> do( "UNLOCK TABLES" );
313 @individual_ids = ($first_id_id .. $last_id_id);
314 $self -> register_di_relation( individual_ids => \@individual_ids );
316 $self -> {'individual_ids'} = \@individual_ids;
318 $sth -> finish;
319 $dbh -> disconnect;
321 $data_id = $self -> {'data_id'}; # return the data_id
323 end register_in_database
324 # }}} register_in_database
326 # {{{ register_di_relation
327 start register_di_relation
328 if ( $PsN::config -> {'_'} -> {'use_database'} and
329 defined $self -> {'data_id'} and $#individual_ids >= 0 ) {
330 my $dbh = DBI -> connect("DBI:mysql:host=".$PsN::config -> {'_'} -> {'database_server'}.
331 ";databse=".$PsN::config -> {'_'} -> {'project'},
332 $PsN::config -> {'_'} -> {'user'},
333 $PsN::config -> {'_'} -> {'password'},
334 {'raiseerror' => 1});
335 my $sth;
336 my $values;
337 my $columns = "( data_id, individual_id )";
338 foreach my $individual_id ( @individual_ids ) {
339 if ( defined $individual_id ) {
340 $values = $values."," if ( defined $values );
341 $values = $values."(".$self -> {'data_id'}.", $individual_id )";
344 $sth = $dbh -> prepare( "INSERT INTO ".$PsN::config -> {'_'} -> {'project'}.
345 ".data_individual $columns VALUES $values" );
346 $sth -> execute;
347 $sth -> finish if ( defined $sth );
348 $dbh -> disconnect;
350 end register_di_relation
351 # }}} register_di_relation
353 # {{{ full_name
355 start full_name
357 $full_name = $self -> {'directory'} . $self -> {'filename'};
359 end full_name
361 # }}}
363 # {{{ bootstrap
365 start bootstrap
367 # The bootstrap method draws I<samples> number of boostrap
368 # samples from the data set. The I<subjects> arguments
369 # determines the size of each sample (default equals to the
370 # number of individuals in the original data set). The method
371 # returns references to three arrays: I<boot_samples_ref>,
372 # which holds the bootstrap data sets, I<incl_individuals_ref>
373 # which holds arrays containing the subject identifiers (ID's)
374 # for the included individuals of each bootstrap data set and
375 # I<included_keys_ref> which holds the key or index of the
376 # included individuals. The key or index is an integer
377 # starting at 1 for the first individual in the original data
378 # set and increasing by one for each following.
379 $self -> synchronize;
380 my @header = @{$self -> {'header'}};
381 my $individuals = $self -> {'individuals'};
382 my $key_ref;
384 my $status_bar = status_bar -> new( steps => $samples );
385 ui -> print( category => 'bootstrap',
386 message => $status_bar -> print_step,
387 newline => 0);
389 for ( my $i = 1; $i <= $samples; $i++ ) {
390 my $new_name = defined $name_stub ? $name_stub."_$i.dta" : "bs$i.dta";
391 $new_name = $directory.'/'.$new_name;
392 my ( $boot, $incl_ind_ref, $incl_key_ref ) =
393 $self -> resample( subjects => \%subjects,
394 resume => $resume,
395 new_name => $new_name,
396 target => $target,
397 stratify_on => $stratify_on,
398 model_id => $model_ids[$i-1] );
399 push( @included_keys, $incl_key_ref );
400 push( @incl_individuals, $incl_ind_ref );
401 # $boot -> renumber_ascending;
402 push( @boot_samples, $boot );
403 # $boot -> synchronize;
404 # $boot -> flush;
405 if( $status_bar -> tick() ){
406 ui -> print( category => 'bootstrap',
407 message => $status_bar -> print_step,
408 newline => 0,
409 wrap => 0);
411 # print Dumper \@boot_samples;
412 # sleep(10);
414 ui -> print( category => 'bootstrap',
415 message => ' ... done' );
417 end bootstrap
419 # }}} bootstrap
421 # {{{ resample
423 start resample
425 $self -> synchronize;
426 my ( @header, $individuals, @bs_inds, $key_ref, @id_ids, @bs_id_ids );
427 @id_ids = @{$self -> {'individual_ids'}} if( defined $self -> {'individual_ids'} );
428 my @subj_keys = keys( %subjects );
429 if ( $#subj_keys < 0 ) {
430 debug -> die( message => "sample_size must be defined" );
432 if ( defined $stratify_on ) {
433 my %strata;
434 if( $stratify_on =~ /\D/ ){
435 %strata = %{$self -> factors( column_head => $stratify_on )};
436 if ( $strata{'Non-unique values found'} eq '1' ) {
437 debug -> die( message => "Individuals were found to have multiple values in the $stratify_on column. ".
438 "The column $stratify_on cannot be used for stratification of the resampling." );
440 } else {
441 %strata = %{$self -> factors( column => $stratify_on )};
442 if ( $strata{'Non-unique values found'} eq '1' ) {
443 debug -> die( message => "Individuals were found to have multiple values in column number $stratify_on. ".
444 "Column $stratify_on cannot be used for stratification of the resampling." );
447 if ( scalar keys( %subjects) != scalar keys( %strata ) and
448 not ( $#subj_keys == 0 and defined $subjects{'default'} ) ) {
449 debug -> die( message => "sample_size must be defined using one default value ".
450 "or exactly one value per strata:\n".
451 "resampling per STUD=1001,1002,1003\n".
452 "use -sample_size='1001=>10,1002=>25,1003=>12' or ".
453 "-sample_size='default=>10'");
455 unless ( $resume and -e $new_name ) {
456 @header = @{$self -> {'header'}};
457 $individuals = $self -> {'individuals'};
458 while( my ( $factor, $key_list ) = each %strata ) {
459 my $keys;
460 if ( defined $subjects{$factor} ) {
461 $keys = $subjects{$factor};
462 } elsif( defined $subjects{'default'} ) {
463 $keys = sprintf( "%.0f",($subjects{'default'}*
464 (scalar(@{$key_list}))/($self -> count_ind())) );
465 } else {
466 debug -> die( message => "A sample size for strata $factor could not be found ".
467 "and no default sample size was set" );
469 for ( my $i = 0; $i < $keys; $i++ ) {
470 my $list_ref = random_uniform_integer(1,0,(scalar(@{$key_list}) - 1));
471 push( @bs_inds, $individuals ->
472 [ $key_list -> [$list_ref] ] -> copy );
473 push( @included_keys, $key_list -> [$list_ref] );
474 push( @incl_individuals, $individuals ->
475 [ $key_list -> [$list_ref] ] -> idnumber );
476 push( @bs_id_ids, $id_ids[ $key_list -> [$list_ref] ] );
480 $boot = data -> new( header => \@header,
481 idcolumn => $self -> {'idcolumn'},
482 ignoresign => $self -> {'ignoresign'},
483 individuals => \@bs_inds,
484 filename => $new_name,
485 ignore_missing_files => 1,
486 target => 'mem' );
487 $boot -> renumber_ascending;
488 $boot -> _write;
489 $boot -> flush;
490 #$boot -> target( $target );
491 } else {
492 # If we are resuming, we still need to generate the
493 # pseudo-random sequence and initiate a data object
494 while( my ( $factor, $key_list ) = each %strata ) {
495 my $keys;
496 if ( defined $subjects{$factor} ) {
497 $keys = $subjects{$factor};
498 } elsif( defined $subjects{'default'} ) {
499 $keys = sprintf( "%.0f",($subjects{'default'}*
500 (scalar(@{$key_list}))/($self -> count_ind())) );
501 } else {
502 debug -> die( message => "A sample size for strata $factor could not be found ".
503 "and no default sample size was set" );
505 for ( my $i = 0; $i < $keys; $i++ ) {
506 my $list_ref = random_uniform_integer(1,0,(scalar(@{$key_list}) - 1));
509 $boot = data -> new( idcolumn => $self -> {'idcolumn'},
510 ignoresign => $self -> {'ignoresign'},
511 filename => $new_name,
512 ignore_missing_files => 1,
513 target => $target );
515 $boot -> _write;
516 $boot -> flush;
518 } else {
519 my $size;
520 if( defined $subjects{'default'} ) {
521 $size = $subjects{'default'};
522 } else {
523 debug -> die( message => "No default sample size was set" );
525 unless ( $resume and -e $new_name ) {
526 @header = @{$self -> {'header'}};
527 $individuals = $self -> {'individuals'};
528 for ( my $i = 1; $i <= $size; $i++ ) {
529 $key_ref = random_uniform_integer(1,0,scalar @{$individuals}-1);
530 push( @bs_inds, $individuals -> [ $key_ref ] -> copy );
531 push( @included_keys, $key_ref );
532 push( @incl_individuals, $individuals -> [ $key_ref ] -> idnumber );
533 push( @bs_id_ids, $id_ids[ $key_ref ] );
536 # MUST FIX: If a file already exists with the same name,
537 # the created bs data set will be appended to this. IT
538 # MUST BE OVERWRITTEN!
539 $boot = data -> new( header => \@header,
540 idcolumn => $self -> {'idcolumn'},
541 ignoresign => $self -> {'ignoresign'},
542 individuals => \@bs_inds,
543 filename => $new_name,
544 ignore_missing_files => 1,
545 target => 'mem' );
546 $boot -> renumber_ascending;
547 $boot -> _write;
548 $boot -> target( $target );
549 } else {
550 # If we are resuming, we still need to generate the
551 # pseudo-random sequence and initiate a data object
552 for ( my $i = 1; $i <= $size; $i++ ) {
553 random_uniform_integer(1,0,scalar @{$individuals}-1)
555 $boot = data -> new( idcolumn => $self -> {'idcolumn'},
556 ignoresign => $self -> {'ignoresign'},
557 filename => $new_name,
558 ignore_missing_files => 1,
559 target => $target );
561 $boot -> _write;
562 $boot -> flush;
564 if( $target eq 'disk'){
565 $boot -> flush;
568 $boot -> register_in_database( individual_ids => \@bs_id_ids,
569 resampled => 1,
570 model_id => $model_id );
572 end resample
574 # }}} resample
576 # {{{ case_deletion
578 start case_deletion
580 # case_deletion creates subsets of the data. The number of
581 # subsets is specified by the bins argument. The individuals
582 # of each subset is selected randomly or in ascending
583 # numerical order depending on the selection argument that can
584 # be either 'consecutive' or 'random'. case_column must be
585 # specified to give the method something to base the selection
586 # on. Valid case_column values are either the column number
587 # (pure digits) or the name of the column in the (optional)
588 # header row.
589 $self -> synchronize;
590 my @header = @{$self -> {'header'}};
591 if ( not defined $case_column ) {
592 debug -> die( message => "case_column must be specified" );
593 } else {
594 if ( not $case_column =~ /^\d/ ) {
595 for ( my $i = 0; $i <= $#header; $i++ ) {
596 $case_column = $i+1 if ( $header[$i] eq $case_column );
600 $bins = defined $bins ? $bins :
601 scalar keys %{$self -> factors( column => $case_column)};
602 my %factors = %{$self -> factors( column => $case_column )};
603 if ( $factors{'Non-unique values found'} eq '1' ) {
604 debug -> die( message => "Individuals were found to have multiple values in column number $case_column. ".
605 "Column $case_column cannot be used for case deletion." );
608 my $maxbins = scalar keys %factors;
609 my @ftrs = sort { $a <=> $b } keys %factors;
610 my $individuals = $self -> {'individuals'};
611 my $maxkey = scalar @{$individuals} - 1;
613 my ( @tmp_ftrs, @binsize ) =
614 ((),());
615 my ( $k, $j, $i ) = ( 0, 0, 0 );
616 # Create the binsizes
617 for ( $j = 0; $j < $maxbins; $j++ ) {
618 $binsize[ $k++ ]++;
619 $k = 0 if( $k >= $bins );
621 $self -> _fisher_yates_shuffle( array => \@ftrs ) if( $selection eq 'random' );
622 for ( $k = 0; $k < $bins; $k++ ) {
623 for ( $j = 0; $j < $binsize[ $k ]; $j++ ) {
624 # print "SK: ",$skipped_keys[ $k ]," F: ",$factors{ $ftrs[ $i ] },"\n";
625 push( @{$skipped_keys[ $k ]}, @{$factors{ $ftrs[ $i ] }} );
626 push( @{$skipped_values[ $k ]}, $ftrs[ $i++ ] );
630 for ( $k = 0; $k < $bins; $k++ ) {
631 my @cd_inds = ();
632 my @del_inds = ();
633 SELKEYS: foreach my $key ( 0..$maxkey ) {
634 foreach my $skipped ( @{$skipped_keys[ $k ]} ) {
635 if ( $key == $skipped ) {
636 push( @{$skipped_ids[ $k ]}, $individuals ->
637 [ $skipped ] -> idnumber );
638 push( @del_inds, $individuals -> [ $key ] -> copy );
639 next SELKEYS;
642 push( @cd_inds, $individuals -> [ $key ] -> copy );
644 # Set ignore_missing_files = 1 to make it possible to get the result
645 # in memory only
646 my $newdata = data ->
647 new ( header => \@header,
648 ignoresign => $self -> {'ignoresign'},
649 idcolumn => $self -> {'idcolumn'},
650 individuals => \@cd_inds,
651 target => $target,
652 filename => $directory.'/cdd_'.($k+1).'.dta',
653 ignore_missing_files => 1 );
654 my $deldata = data ->
655 new ( header => \@header,
656 ignoresign => $self -> {'ignoresign'},
657 idcolumn => $self -> {'idcolumn'},
658 individuals => \@del_inds,
659 target => $target,
660 filename => $directory.'/rem_'.($k+1).'.dta',
661 ignore_missing_files => 1 );
662 push( @subsets, $newdata );
663 push( @remainders, $deldata );
664 $newdata -> _write;
665 $newdata -> flush;
666 $deldata -> _write;
667 $deldata -> flush;
670 end case_deletion
672 # }}} case_deletion
674 # {{{ copy
675 start copy
677 # filename: new data file name.
679 # target: keep the copy in memory ('mem') or write it to disk and flush the memory ('disk').
681 ($directory, $filename) = OSspecific::absolute_path( $directory, $filename );
683 # Clone self into new data object. Why don't the individuals get cloned too?
684 # strange. need to set synced to 0 AND set the {'individuals'} to undef.
685 cp($self -> full_name, $directory.$filename );
686 $new_data = Storable::dclone( $self );
687 $new_data -> {'synced'} = 0;
688 $new_data -> {'individuals'} = undef;
689 $new_data -> synchronize;
691 # Set the new file name for the copy
692 $new_data -> directory( $directory );
693 $new_data -> filename( $filename );
695 end copy
697 # }}} copy
699 # {{{ column_to_array
701 start column_to_array
703 $self -> synchronize;
705 if ( not $column =~ /^\d/ ) {
706 $column = $self -> {'column_head_indices'} -> {$column} - 1;
709 if( $column < 0 or $column > $#{$self -> {'header'}} ){
710 return [];
713 #have separate loops for case without filter and with filter,
714 #to minimize risk of errors.
716 if (scalar(@filter)==0){
717 foreach my $individual ( @{$self -> individuals} ){
718 foreach my $individual_row( @{$individual -> subject_data} ){
719 my @row = split(/,/ , $individual_row);
720 push( @array, $row[$column] );
723 } else {
724 my $index=0;
725 foreach my $individual ( @{$self -> individuals} ){
726 foreach my $individual_row( @{$individual -> subject_data} ){
727 if ($filter[$index] > 0){
728 my @row = split(/,/ , $individual_row);
729 push( @array, $row[$column] );
731 $index++;
732 if ($index > $#filter){
733 $index=0;
737 unless ($index == 0){
738 'debug' -> die(message =>"Number of rows in dataset was not a multiple of ".
739 "filter length.");
744 end column_to_array
746 # }}}
748 # {{{ count_ind
750 start count_ind
752 # Returns the number of individuals in the data set.
753 $self -> synchronize;
754 if( defined $self -> individuals() ) {
755 $num = scalar @{$self -> individuals()};
756 } else {
757 debug -> die( message => "No individuals found in file ".
758 $self -> filename() );
761 end count_ind
763 # }}} count_ind
765 # {{{ diff
767 start diff
769 $self -> synchronize;
771 my $first_id = $self -> {'individuals'}[0];
773 debug -> die( message => "No individuals defined in data object based on ".
774 $self -> full_name ) unless ( defined $first_id );
776 # Check if $column(-index) is defined and valid, else try to find index
777 # using column_head
779 my @data_row = split( /,/, $first_id -> subject_data -> [0] );
780 if( $#columns >= 0 ) {
781 foreach my $column ( @columns ) {
782 unless ( defined $column && defined( $data_row[$column-1] ) ) {
783 debug -> die( message => "Error in data -> factors: ".
784 "invalid column number: \"$column\"\n".
785 "Valid column numbers are 1 to ".
786 scalar @{$first_id -> subject_data ->[0]}."\n" );
789 } elsif ( $#column_heads >= 0 ) {
790 foreach my $column_head ( @column_heads ) {
791 unless (defined($column_head) && defined($self -> {'column_head_indices'}{$column_head})) {
792 debug -> die( message => "Error in data -> factors: unknown column: \"$column_head\" ".
793 "Valid column headers are (in no particular order):\n".
794 join(', ',keys(%{$self -> {'column_head_indices'}})) );
795 } else {
796 my $column = $self -> {'column_head_indices'}{$column_head};
797 push( @columns, $column );
798 debug -> warn( level => 2,
799 message => "$column_head is in column number $column" );
802 } else {
803 debug -> die( message => "No column or column_head defined" );
806 if( $global_largest or $global_smallest or
807 $largest_per_individual or $smallest_per_individual ) {
808 if( not scalar @{$self -> {'individuals'}} == scalar @{$against_data -> individuals} ) {
809 debug -> die( message => "Both data object must hold the same number of individuals ".
810 "and observations when calling data -> diff" );
812 for( my $i = 0; $i < scalar @{$self -> {'individuals'}}; $i++ ) {
813 my %id_diffs = %{$self -> {'individuals'}[$i] ->
814 diff( against_individual => $against_data -> individuals -> [$i],
815 columns => \@columns,
816 absolute_diff => $absolute_diff,
817 diff_as_fraction => $diff_as_fraction,
818 largest => ( $global_largest or $largest_per_individual ),
819 smallest => ( $global_smallest or $smallest_per_individual ) )};
820 if( $global_largest ) {
821 for( my $j = 0; $j <= $#columns; $j++ ) {
822 my $label = defined $column_heads[$j] ? $column_heads[$j] : $columns[$j];
823 if( not defined $diff_results{$label} or not defined $diff_results{$label}{'diff'} or
824 $id_diffs{$columns[$j]}{'diff'} > $diff_results{$label}{'diff'} ) {
825 $diff_results{$label}{'diff'} = $id_diffs{$columns[$j]}{'diff'};
826 $diff_results{$label}{'self'} = $id_diffs{$columns[$j]}{'self'};
827 $diff_results{$label}{'test'} = $id_diffs{$columns[$j]}{'test'};
832 } else {
833 die "data -> diff is only implemented for finding the largest difference at any observation at this point\n";
836 end diff
838 # }}} diff
840 # {{{ filename
841 start filename
843 if ( defined $parm and $parm ne $self -> {'filename'} ) {
844 $self -> {'filename'} = $parm;
845 $self -> {'data_id'} = undef;
846 # $self -> _write;
849 end filename
850 # }}} filename
852 # {{{ fractions
854 start fractions
856 my %factors = $self -> factors( 'return_occurences' => 1,
857 'unique_in_individual' => $unique_in_individual,
858 'column_head' => $column_head,
859 'column' => $column);
861 my $sum = 0;
862 while (my ($factor, $amount) = each %factors) {
863 if ( $factor == $self -> {'missing_data'} && $ignore_missing ) {
864 next;
865 } else {
866 $sum += $amount;
869 while (my ($factor, $amount) = each %factors) {
870 if ( $factor == $self -> {'missing_data'} && $ignore_missing ) {
871 next;
872 } else {
873 $fractions{$factor} = $amount/$sum;
877 end fractions
879 # }}} fractions
881 # {{{ factors
883 start factors
885 # Either column (number, starting at 1) or column_head must be specified.
887 # The default behaviour is to return a hash with the factors as keys
888 # and as values references to arrays with the order numbers (not the ID numbers)
889 # of the individuals that contain this factor
891 # If unique_in_individual is true (1), the returned hash will contain
892 # an element with key 'Non-unique values found' and value 1 if any
893 # individual contain more than one value in the specified column.
895 # Return occurences will calculate the occurence of each
896 # factor value. Several occurences in one individual counts as
897 # one occurence. The elements of the returned hash will have the factors
898 # as keys and the number of occurences as values.
901 $self -> synchronize;
903 # Check if $column(-index) is defined and valid, else try to find index
904 # using column_head
905 my $first_id = $self -> {'individuals'}[0];
907 debug -> die( message => "No individuals defined in data object based on ".
908 $self -> full_name ) unless ( defined $first_id );
910 my @data_row = split( /,/, $first_id -> subject_data -> [0] );
911 unless ( defined $column && defined( $data_row[$column-1] ) ) {
912 unless (defined($column_head) && defined($self -> {'column_head_indices'}{$column_head})) {
913 debug -> die( message => "Error in data -> factors: unknown column: \"$column_head\" ".
914 "or invalid column number: \"$column\".\n".
915 "Valid column numbers are 1 to ".scalar @data_row ."\n".
916 "Valid column headers are (in no particular order):\n".
917 join(', ',keys(%{$self -> {'column_head_indices'}})) );
918 } else {
919 $column = $self -> {'column_head_indices'}{$column_head};
920 debug -> warn( level => 2,
921 message => "$column_head is in column number $column" );
925 my $key = 0;
926 foreach my $individual ( @{$self -> {'individuals'}} ) {
927 my @ifactors = keys %{$individual -> factors( column => $column )};
928 if ( scalar @ifactors > 1 and $unique_in_individual ) {
929 %factors = ( 'Non-unique values found' => 1 );
930 last;
932 debug -> die( message => "No value found in column $column in individual ".
933 $individual -> idnumber ) if ( scalar @ifactors == 0 );
935 # Return occurences will calculate the occurence of each
936 # factor value. Several occurences in one individual counts as
937 # one occurence.
939 if ( $return_occurences ) {
940 foreach my $ifactor ( @ifactors ) {
941 $factors{$ifactor}++;
943 } else {
944 foreach my $ifactor ( @ifactors ) {
945 push( @{$factors{$ifactor}}, $key );
948 $key++;
951 end factors
953 # }}} factors
955 # {{{ find_individual
957 # start find_individual
958 # foreach my $tmp_ind ( @{$self -> individuals} ) {
959 # if ( $tmp_ind -> key == $key ) {
960 # $individual = $tmp_ind;
961 # last;
964 # if ( defined $individual ) {
965 # if ( $copy ) {
966 # $individual = $individual -> copy;
968 # } else {
969 # print "No individual with key $key found in call to ".
970 # "data -> find_individual\n" if ( $self -> debug );
972 # end find_individual
974 # }}}
976 # {{{ format_data
978 start format_data
980 my $header = $self -> {'header'};
982 # format the data for NONMEM (simple comma-separated layout)
983 if ( defined $self -> {'comment'} ) {
984 my @comment = @{$self -> {'comment'}};
985 for ( @comment ) {
986 push( @form_data );
990 my $wrap = ( defined $self -> {'wrap_column'} and
991 defined $self -> {'cont_column'} );
993 my @primary_columns = defined $self -> {'primary_columns'} ?
994 @{$self -> {'primary_columns'}} : ();
995 my @secondary_columns = defined $self -> {'secondary_columns'} ?
996 @{$self -> {'secondary_columns'}} : ();
997 if ( defined $header and defined $self -> {'ignoresign'} ) {
998 my $istr;
999 if ( $self -> {'ignoresign'} ne '@' ) {
1000 $istr = $self -> {'ignoresign'};
1002 if ( $wrap ) {
1003 my @h_data;
1004 for ( my $i = 0; $i <= $#secondary_columns ; $i++ ) {
1005 my $sstr = $istr;
1006 for ( my $j = 0; $j < scalar @{$secondary_columns[$i]} ; $j++ ) {
1007 my $jstr = $j == 0 ? '' : ',';
1008 $sstr = $sstr.$jstr.$secondary_columns[$i][$j][0];
1010 push( @h_data, $sstr."\n" );
1012 push( @form_data, @h_data );
1013 my $pstr = $istr;
1014 for ( my $i = 0; $i <= $#primary_columns ; $i++ ) {
1015 my $jstr = $i == 0 ? '' : ',';
1016 $pstr = $pstr.$jstr.$primary_columns[$i][0];
1018 push( @form_data, $pstr."\n" );
1019 } else {
1020 push( @form_data, $istr.join(',',@{$self -> {'header'}})."\n" );
1023 if ( $wrap ) {
1024 foreach my $individual ( @{$self -> {'individuals'}} ) {
1025 foreach my $row ( @{$individual -> subject_data} ) {
1026 my @r_data;
1027 for ( my $i = 0; $i <= $#secondary_columns ; $i++ ) {
1028 my $sstr = '';
1029 for ( my $j = 0; $j < scalar @{$secondary_columns[$i]} ; $j++ ) {
1030 my $jstr = $j == 0 ? '' : ',';
1031 if ( $secondary_columns[$i][$j][0] eq 'CONT' ) {
1032 $sstr = $sstr.$jstr.'1';
1033 } else {
1034 my @data_row = split( /,/, $row );
1035 $sstr = $sstr.$jstr.$data_row[$secondary_columns[$i][$j][1]];
1038 push( @r_data, $sstr."\n" );
1040 push( @form_data, @r_data );
1041 my $pstr = '';
1042 for ( my $i = 0; $i <= $#primary_columns ; $i++ ) {
1043 my $jstr = $i == 0 ? '' : ',';
1044 if ( $primary_columns[$i][0] eq 'CONT' ) {
1045 $pstr = $pstr.$jstr.'0';
1046 } else {
1047 my @data_row = split( /,/, $row );
1048 $pstr = $pstr.$jstr.$data_row[$primary_columns[$i][1]];
1051 push( @form_data, $pstr."\n" );
1054 } else {
1055 foreach my $individual ( @{$self -> {'individuals'}} ) {
1056 foreach my $row ( @{$individual -> subject_data} ) {
1057 push( @form_data, $row ."\n" );
1062 end format_data
1064 # }}} format_data
1066 # {{{ drop_dropped
1068 start drop_dropped
1070 # This method removes columns that has '=DROP' value in the
1071 # model header as given by $INPUT. The model header must be
1072 # transfered to this method through the model_header
1073 # argument. The model_header argument should be a
1074 # two-dimensional array where each position in the first
1075 # dimension should be a reference to a 1*2 array holding the
1076 # column name and value. Any ignore-sign must be removed.
1078 debug -> die( message => 'model header must be defined' )
1079 if ( $#model_header < 0 );
1080 # Important that the drop_dropped method of the model::problem
1081 # class is in sync with this method.
1082 $self -> synchronize;
1084 $self -> {'header'} = [];
1085 my @drop;
1086 my $counter = 1;
1087 for( my $i = 0; $i <= $#model_header; $i++ ) {
1088 $self -> {'idcolumn'} = $counter if ( $model_header[$i][0] eq 'ID' );
1089 if( ( $model_header[$i][1] eq 'DROP' or
1090 $model_header[$i][1] eq 'SKIP' ) and
1091 not $model_header[$i][0] =~ /DAT(E|1|2|3)/ ) {
1092 push( @drop, 1 );
1093 } else {
1094 $counter++;
1095 push( @drop, 0 );
1096 push( @{$self -> {'header'}}, $model_header[$i][0] );
1100 foreach my $individual ( @{$self -> {'individuals'}} ) {
1101 $individual -> drop_columns( drop => \@drop );
1104 $self -> {'synced'} = 0;
1105 # $Data::Dumper::Maxdepth = 2;
1106 # die Dumper $self;
1107 # die Dumper $self -> {'individuals'};
1109 end drop_dropped
1111 # }}} drop_dropped
1113 # {{{ wrap
1114 start wrap
1116 $self -> synchronize;
1117 $self -> cont_column( $cont_column ) if ( defined $cont_column );
1118 $self -> wrap_column( $wrap_column ) if ( defined $wrap_column );
1119 $self -> prepare_wrap( model_header => \@model_header );
1120 @secondary_columns = @{$self -> {'secondary_columns'}}
1121 if ( defined $self -> {'secondary_columns'} );
1122 @primary_columns = @{$self -> {'primary_columns'}}
1123 if ( defined $self -> {'primary_columns'} );
1124 $self -> synced(0);
1126 end wrap
1127 # }}} wrap
1129 # {{{ unwrap
1130 start unwrap
1132 $self -> {'cont_column'} = undef;
1133 $self -> {'wrap_column'} = undef;
1134 $self -> {'secondary_columns'} = undef;
1135 $self -> {'primary_columns'} = undef;
1137 end unwrap
1138 # }}} unwrap
1140 # {{{ prepare_wrap
1142 start prepare_wrap
1144 my $cont_column = $self -> {'cont_column'};
1145 my $wrap_column = $self -> {'wrap_column'};
1146 debug -> die( message => 'cont_column ('.$cont_column.') must be less or equal '.
1147 'to the requested number of columns in each row ('.
1148 ($wrap_column).')' )
1149 if ( $cont_column > $wrap_column );
1150 my @header;
1151 if ( scalar @model_header > 0 ) {
1152 @header = @model_header;
1153 } else {
1154 @header = @{$self -> {'header'}};
1157 my ( @primary, @secondary, @date_columns );
1159 for ( my $i = 0; $i <= $#header; $i++ ) {
1160 my $name = ref( $header[$i] ) eq 'ARRAY' ? $header[$i][0] : $header[$i];
1161 my $value = ref( $header[$i] ) eq 'ARRAY' ? $header[$i][1] : undef;
1162 next if ( $name eq 'ID' );
1163 my $found = 0;
1164 foreach my $prim ( @primary_column_names ) {
1165 if ( not $found and
1166 ( $name eq $prim or $value eq $prim ) ) {
1167 push( @primary, [$name, $i, $value] );
1168 $found = 1;
1169 my $col = ($#primary+2)>= $cont_column ? ($#primary+3) : ($#primary+2);
1170 push( @date_columns, $col ) if ( $name =~ /DAT(E|1|2|3)/ );
1173 push( @secondary, [$name, $i, $value] ) if ( not $found );
1176 my $prim_num = scalar @primary;
1177 debug -> die( message => 'The number of primary columns (that need to '.
1178 'be part of the row with CONT=0) ('.($prim_num+1).
1179 ') is larger than the required number of columns (wrap_column='.
1180 $wrap_column.') - 1' )
1181 if ( scalar $prim_num > ($wrap_column-2) );
1183 my ( $i, $dum ) = ( 0, 1 );
1184 my @tmp;
1185 for ( my $j = 1; $j <= $wrap_column; $j++ ) {
1186 if( $j == 1 ) {
1187 push( @tmp, ['ID', $self -> {'idcolumn'}-1] );
1188 } elsif ( $j == $wrap_column ) {
1189 if ( $j == $cont_column ) {
1190 push( @tmp, ['CONT', undef] );
1191 } else {
1192 my $val;
1193 if ( defined $primary[$i] ) {
1194 $val = $primary[$i];
1195 } elsif ( defined $secondary[0] ) {
1196 $val = shift(@secondary);
1197 } else {
1198 $val = ['XX'.$dum++,$self -> {'idcolumn'}-1];
1200 push( @tmp, $val );
1201 $i++;
1203 push( @{$self -> {'primary_columns'}}, @tmp );
1204 } else {
1205 if ( $j == $cont_column ) {
1206 push( @tmp, ['CONT', undef] );
1207 } else {
1208 if ( $i <= $#primary ) {
1209 push( @tmp, $primary[$i] );
1210 $i++;
1211 } else {
1212 my $val = defined $secondary[0] ? shift(@secondary) :
1213 ['XX'.$dum++,$self -> {'idcolumn'}-1];
1214 push( @tmp, $val );
1220 my $i = 0;
1221 while ( $i <= $#secondary ) {
1222 my @tmp;
1223 for ( my $j = 1; $j <= $wrap_column; $j++ ) {
1224 if( $j == 1 ) {
1225 push( @tmp, ['ID', $self -> {'idcolumn'}-1] );
1226 } elsif ( $j == $wrap_column ) {
1227 if ( $j == $cont_column ) {
1228 push( @tmp, ['CONT', undef] );
1229 } else {
1230 my $val = defined $secondary[$i] ? $secondary[$i] :
1231 ['XX'.$dum++,$self -> {'idcolumn'}-1];
1232 push( @tmp, $val );
1233 $i++;
1235 unshift( @{$self -> {'secondary_columns'}}, \@tmp );
1236 } else {
1237 if ( $j == $cont_column ) {
1238 push( @tmp, ['CONT', undef] );
1239 } else {
1240 my $isdate = 0;
1241 if ( $#date_columns >= 0 ) {
1242 foreach my $col ( @date_columns ) {
1243 # This is a date column which may have to be dropped
1244 # and thus will not appear as a secondary
1245 # column. Nothing should be pushed. The indexes in
1246 # model::problem::pk::_format_record will be ok.
1247 $isdate = 1 if ( $col == $j ) ;
1250 if ( $isdate ) {
1251 push( @tmp, ['XX'.$dum++,$self -> {'idcolumn'}-1] );
1252 } else {
1253 if ( $i <= $#secondary ) {
1254 push( @tmp, $secondary[$i] );
1255 $i++;
1256 } else {
1257 push( @tmp, ['XX'.$dum++,$self -> {'idcolumn'}-1] );
1265 end prepare_wrap
1267 # }}} prepare_wrap
1269 # {{{ have_missing_data
1271 start have_missing_data
1273 # Either I<column> or I<column_head> must be specified.
1275 # This method looks through the data column with index I<column> or
1276 # (optional) header name I<column_head> and returns O if no missing
1277 # data indicator was found or 1 otherwise.
1279 $self -> synchronize;
1280 my $first_id = $self -> {'individuals'}[0];
1281 debug -> die( message => "No individuals defined in data object based on ".
1282 $self -> full_name ) unless ( defined $first_id );
1283 my @data_row = split( /,/ , $first_id -> subject_data -> [0] );
1284 unless ( defined $column && defined( $data_row[$column-1] ) ) {
1285 unless(defined($column_head) && defined($self -> {'column_head_indices'}{$column_head})){
1286 die "Error in data -> have_missing_data: unknown column: \"$column_head\" or invalid column number: \"$column\"\n";
1287 } else {
1288 $column = $self -> {'column_head_indices'}{$column_head};
1291 $self -> flush if ( $self -> {'target'} eq 'disk' );
1293 # In case anyone wonders, the ternary statment ( bool ? true :
1294 # false ) below will possibly make a minuscle memory
1295 # optimization. But hey, why not :)
1297 $return_value = defined $self -> {'have_missing_data'} ? $self -> {'have_missing_data'} -> {$column} : 0;
1299 end have_missing_data
1301 # }}} have_missing_data
1303 # {{{ merge
1304 start merge
1306 #$self -> synchronize;
1307 push( @{$self -> {'individuals'}}, @{$mergeobj -> individuals} );
1309 end merge
1310 # }}} merge
1312 # {{{ max
1314 start max
1316 # Either column or column_head must be specified. Column_head must be a string that
1317 # identifies a column in the (optional ) data file header.
1319 # The if-statement below used to be a cache of allready calculated
1320 # means. But since individuals can be accessed in so many ways, we
1321 # don't know when this cache should be updated. Its easier to
1322 # recalculate the max. Maybe we can include this optimization in the
1323 # future, if it turns out to be a bottleneck
1324 # my $tmp_column = $self -> {'column_head_indices'}{$column_head};
1325 # if ( defined $self -> {'max'}[$tmp_column] ) {
1326 # $return_value = $self -> {'max'}[$tmp_column] ;
1327 # } else {
1328 $self -> synchronize;
1329 my $first_id = $self -> {'individuals'}[0];
1330 debug -> die( message => "data -> max: No individuals defined in data object based on " .
1331 $self -> full_name ) unless defined $first_id;
1333 my @data_row = split( /,/ , $first_id -> subject_data ->[0] );
1335 unless ( defined $column && defined( $data_row[$column-1] ) ) {
1336 unless (defined($column_head) && defined($self -> {'column_head_indices'}{$column_head})) {
1337 die "Error in data -> max: unknown column: \"$column_head\" or invalid column number: \"$column\"\n";
1338 } else {
1339 $column = $self -> {'column_head_indices'}{$column_head};
1342 foreach my $individual ( @{$self -> {'individuals'}} ) {
1343 my $ifactors = $individual -> factors( 'column' => $column );
1344 foreach ( keys %{$ifactors} ) {
1345 next if ( $_ == $self -> {'missing_data_token'} );
1346 if ( defined ($return_value) ) {
1347 $return_value = $_ > $return_value ? $_ : $return_value;
1348 } else {
1349 $return_value = $_;
1354 # $self -> {'max'}[$column] = $return_value;
1355 $self -> flush if ( $self -> {'target'} eq 'disk' );
1358 end max
1360 # }}} max
1362 # {{{ min
1364 start min
1366 # See L</max>.
1367 my $tmp_column = $self -> {'column_head_indices'}{$column_head};
1369 # The if-statement below used to be a cache of allready calculated
1370 # means. But since individuals can be accessed in so many ways, we
1371 # don't know when this cache should be updated. Its easier to
1372 # recalculate the min. Maybe we can include this optimization in the
1373 # future, if it turns out to be a bottleneck
1374 # if ( defined $self -> {'min'}[$tmp_column] ) {
1375 # $return_value = $self -> {'min'}[$tmp_column] ;
1376 # } else {
1377 $self -> synchronize;
1378 my $first_id = $self -> {'individuals'}[0];
1379 die "data -> min: No individuals defined in data object based on ",
1380 $self -> full_name,"\n" unless defined $first_id;
1382 my @data_row = split( /,/ , $first_id -> subject_data ->[0] );
1384 unless ( defined $column && defined( $data_row[$column-1] ) ) {
1385 unless (defined($column_head) && defined($self -> {'column_head_indices'}{$column_head})) {
1386 die "Error in data -> min: unknown column: \"$column_head\" or invalid column number: \"$column\"\n";
1387 } else {
1388 $column = $self -> {'column_head_indices'}{$column_head};
1391 foreach my $individual ( @{$self -> {'individuals'}} ) {
1392 my $ifactors = $individual -> factors( 'column' => $column );
1393 foreach ( keys %{$ifactors} ) {
1394 next if ( $_ == $self -> {'missing_data_token'} );
1395 if ( defined ($return_value) ) {
1396 $return_value = $_ < $return_value ? $_ : $return_value;
1397 } else {
1398 $return_value = $_;
1402 # $self -> {'min'}[$column] = $return_value;
1403 $self -> flush if ( $self -> {'target'} eq 'disk' );
1406 end min
1408 # }}} min
1410 # {{{ median
1412 start median
1414 # See L</max>.
1415 $self -> synchronize;
1416 my $first_id = $self -> {'individuals'}[0];
1417 die "data -> median: No individuals defined in data object based on ",
1418 $self -> full_name,"\n" unless defined $first_id;
1420 my @data_row = split( /,/ , $first_id -> subject_data ->[0] );
1422 unless ( defined $column && defined( $data_row[$column-1] ) ) {
1423 unless(defined($column_head) && defined($self -> {'column_head_indices'}{$column_head})){
1424 die "Error in data -> median: unknown column: \"$column_head\" or invalid column number: \"$column\"\n";
1425 } else {
1426 $column = $self -> {'column_head_indices'}{$column_head};
1430 if( defined $self -> {'median'}[$column] ){
1431 return $self -> {'median'}[$column];
1434 my @median_array;
1436 foreach my $individual ( @{$self -> {'individuals'}} ) {
1437 if( $unique_in_individual ){
1438 my $ifactors = $individual -> factors( 'column' => $column );
1440 foreach ( keys %{$ifactors} ) {
1441 next if ( $_ == $self -> {'missing_data_token'} );
1442 push( @median_array, $_ );
1444 } else {
1445 my $ifactors = $individual -> subject_data;
1447 for(my $i=0; $i<=$#{$ifactors}; $i++ ) {
1448 my @data_row = split( /,/ , $ifactors -> [$i] );
1449 next if ( $data_row[$column-1] == $self -> {'missing_data_token'} );
1450 push(@median_array, $data_row[$column-1]);
1454 @median_array = sort {$a <=> $b} @median_array ;
1456 if( @median_array % 2 ){
1457 $return_value = $median_array[$#median_array / 2];
1458 } else {
1459 $return_value = ( $median_array[@median_array / 2] +
1460 $median_array[(@median_array - 2) / 2] ) / 2;
1463 $self -> {'median'}[$column] = $return_value;
1465 end median
1467 # }}} median
1469 # {{{ mean
1471 start mean
1473 # Returns mean value of a column
1474 # If a individual contains more then 1 value (i.e. if an
1475 # individual has different values in different samples a mean
1476 # value of all individuals if calculate first, then the mean
1477 # value of the column If hi_cutoff is defined the mean function
1478 # will cut all value below the cutoff, and set their value to
1479 # 0. It's used to calculate the HI-mean/LOW-mean of a column for
1480 # e.g. Hockey-stick covariates If both hi_cutoff and low_cutoff
1481 # are defined only the hi_cutoff will be used. See L</max>.
1482 my $tmp_column = $self -> {'column_head_indices'}{$column_head};
1483 $self -> synchronize;
1484 my $first_id = $self -> {'individuals'}[0];
1485 die "data -> mean: No individuals defined in data object based on ",
1486 $self -> full_name,"\n" unless defined $first_id;
1488 my @data_row = split( /,/, $first_id -> subject_data ->[0] );
1490 unless ( defined $column && defined( $data_row[$column-1] ) ) {
1491 unless (defined($column_head) && defined($self -> {'column_head_indices'}{$column_head})) {
1492 die "Error in data -> mean: unknown column: \"$column_head\" or invalid column number: \"$column\"\n";
1493 } else {
1494 $column = $self -> {'column_head_indices'}{$column_head};
1498 ## Here the calculation starts
1499 my $num_individuals = 0;
1500 my $sum = 0;
1502 my $all_data_rows=0;
1503 foreach my $individual ( @{$self ->{'individuals'}} ) {
1505 my $ifactors = $individual -> subject_data;
1506 my $individual_sum = 0;
1507 my $data_rows = 0;
1508 for(my $i=0; $i<=$#{$ifactors}; $i++ ) {
1510 # data is stored in strings. We need to split them into an
1511 # array.
1513 my @data_row = split( /,/, $ifactors -> [$i] );
1514 if ( $data_row[$column-1] == $self -> {'missing_data_token'} ) {
1515 # print "Skipping row with missing data\n";
1516 next;
1519 if( defined $subset_column and not eval ( $data_row[$subset_column-1].$subset_syntax ) ) {
1520 # print "Skipping row outside subset: syntax: ".($subset_column-1)." $subset_syntax\n";
1521 next;
1524 if (defined $hi_cutoff) {
1525 if ($data_row[$column-1]>$hi_cutoff) {
1526 $individual_sum += $data_row[$column-1]-$hi_cutoff;
1529 else {
1530 if (defined $low_cutoff) {
1531 if ($data_row[$column-1]<$low_cutoff) {
1532 $individual_sum += $low_cutoff - $data_row[$column-1];
1535 else {
1536 $individual_sum += $data_row[$column-1];
1539 $data_rows++;
1541 if( $global_mean ) {
1542 $sum += $individual_sum;
1543 $num_individuals += $data_rows;
1544 } else {
1545 if( $data_rows != 0 ) {
1546 $sum += $individual_sum/$data_rows;
1548 $num_individuals ++;
1550 $all_data_rows += $data_rows;
1552 if( $num_individuals != 0 ) {
1553 $return_value = $sum / $num_individuals;
1555 # print "DR: $all_data_rows\n";
1556 # print "\nNIM: $num_individuals $return_value\n";
1559 end mean
1561 # }}} mean
1563 # {{{ sd
1565 start sd
1567 # This sub returns standard deviation for a specific column
1568 # If there are more than one sample/individual the value used for that specific
1569 # individual is the mean value of its samples.
1570 # The cut-offs are for hockey stick variables. I.e. If one individual value is
1571 # lower than the hi-cutoff the individual value will be zero.
1572 # HI_cutoff is used to calculate the HI-mean of a column.
1573 # If cut_off is undef it won't be used
1574 # See L</max>.
1575 my $tmp_column = $self -> {'column_head_indices'}{$column_head};
1576 $self -> synchronize;
1577 my $first_id = $self -> {'individuals'}[0];
1578 debug -> die( message => "No individuals defined in data object based on ".
1579 $self -> full_name ) unless defined $first_id;
1581 my @data_row = split( /,/ , $first_id -> subject_data ->[0] );
1583 unless ( defined $column && defined( $data_row[$column-1] ) ) {
1584 unless (defined($column_head) && defined($self -> {'column_head_indices'}{$column_head})) {
1585 debug -> die( message => "Unknown column: \"$column_head\" or "
1586 ."invalid column number: \"$column\"" );
1587 } else {
1588 $column = $self -> {'column_head_indices'}{$column_head};
1592 ## Here the calculation starts
1593 my $num_individuals = 0;
1594 my $sum = 0;
1595 my $mean;
1596 if (defined $hi_cutoff) {
1597 $mean = $self->mean(column => $column,
1598 hi_cutoff => $hi_cutoff,
1599 global_mean => $global_sd );
1600 } elsif (defined $low_cutoff) {
1601 $mean = $self->mean(column => $column,
1602 low_cutoff => $low_cutoff,
1603 global_mean => $global_sd );
1604 } else {
1605 $mean = $self->mean( column => $column,
1606 subset_column => $subset_column,
1607 subset_syntax => $subset_syntax,
1608 global_mean => $global_sd );
1611 foreach my $individual ( @{$self -> {'individuals'}} ) {
1612 my $ifactors = $individual -> subject_data;
1613 my $individual_sum = 0;
1614 my $data_rows = 0;
1615 for(my $i=0; $i<=$#{$ifactors}; $i++ ) {
1617 # data is stored in strings. We need to split them into an
1618 # array.
1620 my @data_row = split( /,/, $ifactors -> [$i] );
1622 if ( $data_row[$column-1] == $self -> {'missing_data_token'} ) {
1623 # print "Skipping row with missing data\n";
1624 next;
1627 if( defined $subset_column and not eval ( $data_row[$subset_column-1].$subset_syntax ) ) {
1628 # print "Skipping row outside subset: syntax: ".($subset_column-1)." $subset_syntax\n";
1629 next;
1632 if (defined $hi_cutoff) {
1633 if ($data_row[$column-1]>$hi_cutoff) {
1634 if( $global_sd ) {
1635 $individual_sum += ($data_row[$column-1] - $hi_cutoff - $mean) ** 2;
1636 } else {
1637 $individual_sum += $data_row[$column-1]-$hi_cutoff;
1640 } else {
1641 if (defined $low_cutoff) {
1642 if ($data_row[$column-1]<$low_cutoff) {
1643 if( $global_sd ) {
1644 $individual_sum += ($low_cutoff - $data_row[$column-1] - $mean) ** 2;
1645 } else {
1646 $individual_sum += $low_cutoff - $data_row[$column-1];
1649 } else {
1650 if( $global_sd ) {
1651 $individual_sum += ($data_row[$column-1] - $mean) ** 2;
1652 } else {
1653 $individual_sum += $data_row[$column-1];
1657 $data_rows++;
1659 if( $global_sd ) {
1660 $sum += $individual_sum;
1661 $num_individuals += $data_rows;
1662 } else {
1663 if( $data_rows != 0 ) {
1664 $sum += ($individual_sum/$data_rows - $mean) ** 2;
1666 $num_individuals++;
1669 if( $num_individuals < 2 ) {
1670 $return_value = 0;
1671 } else {
1672 if( $num_individuals != 0 ) {
1673 $return_value = (1/($num_individuals-1)*$sum) ** 0.5;
1678 end sd
1680 # }}} sd
1682 # {{{ range
1684 start range
1686 # See L</max>.
1687 my $tmp_column = $self -> {'column_head_indices'}{$column_head};
1688 if ( defined $self -> {'range'}[$tmp_column] ) {
1689 $return_value = $self -> {'range'}[$tmp_column];
1690 } else {
1691 my $old_target = $self -> {'target'};
1692 $self -> {'target'} = 'mem';
1693 $self -> synchronize;
1694 $return_value = $self -> max( column => $column,
1695 column_head => $column_head ) -
1696 $self -> min( column => $column,
1697 column_head => $column_head );
1698 $self -> {'range'}[$column] = $return_value;
1699 if ( $old_target eq 'disk' ) {
1700 $self -> flush if ( $self -> {'target'} eq 'disk' );
1701 $self -> {'target'} = 'disk';
1705 end range
1707 # }}} range
1709 # {{{ recalc_column
1711 start recalc_column
1713 # Recalculates a column based on expression. Also, see L</max>.
1714 $self -> synchronize;
1716 # Check if $column(-index) is defined and valid, else try to find index using column_head
1717 my $first_id = $self -> {'individuals'}[0];
1718 die "data -> recalc_column: No individuals defined in data object based on ",
1719 $self -> full_name,"\n" unless defined $first_id;
1721 my @data_row = split( /,/ , $first_id -> subject_data ->[0] );
1723 unless ( defined $column && defined( $data_row[$column-1] ) ) {
1724 if(defined($column_head) && defined($self -> {'column_head_indices'}{$column_head})){
1725 die "Error in data -> recalc_column: unknown column: \"$column_head\" or column number: \"$column\"\n";
1726 } else {
1727 $column = $self -> {'column_head_indices'}{$column_head};
1731 for my $individual ( @{$self -> {'individuals'}} ) {
1732 $individual -> recalc_column( column => $column,
1733 expression => $expression );
1736 end recalc_column
1738 # }}} recalc_column
1740 # {{{ renumber_ascending
1742 start renumber_ascending
1744 # Renumbers the individuals (changes the subject identifiers) so that
1745 # all have unique integer numbers starting with start_at and
1746 # ascending. The primary use of this
1747 # method is not to order the individuals after their identifiers but to
1748 # ensure that all individuals have unique identifiers.
1750 $self -> synchronize;
1751 foreach my $individual ( @{$self -> {'individuals'}} ) {
1752 $individual -> idnumber ( $start_at++ );
1754 $self -> {'synced'} = 0;
1756 end renumber_ascending
1758 # }}} renumber_ascending
1760 # {{{ renumber_descending
1762 start renumber_descending
1764 # See L</renumber_ascending>.
1765 $self -> synchronize;
1766 foreach my $individual ( @{$self -> {'individuals'}} ) {
1767 $individual -> idnumber ( $start_at-- );
1769 $self -> {'synced'} = 0;
1771 end renumber_descending
1773 # }}} renumber_descending
1775 # {{{ single_valued_data
1777 start single_valued_data
1779 # Usage:
1781 # ($single_value_data_set, $remainder, $column_indexes) =
1782 # $data_object -> single_valued_data( subset_name => 'subset.dta',
1783 # remainder_name => 'remainder.dta',
1784 # target => 'disk',
1785 # do_not_test_columns => [1..18,24,26];
1787 # my $single_value_column_indexes = $column_indexes -> [0];
1788 # my $all_other_column_indexes = $column_indexes -> [1];
1790 # Analyses the content of each column, based on the
1791 # ID column, and returns two new data objects: One
1792 # that contains all columns that is has only one value per
1793 # individual and one that contains the
1794 # remainding data. This is useful for creating compact 'extra'
1795 # data sets that can be read in via user-defined sub-routines
1796 # when the number of columns needed exceeds the maximum that
1797 # NONMEM allows (e.g. 20 in NONMEM version V).
1799 # The I<do_not_test_columns> argument specifies on which columns
1800 # to skip the single value test
1802 my @multi_value_flags;
1803 my @individuals = @{$self -> {'individuals'}};
1804 # Initiate the flags:
1805 if ( defined $individuals[0] ) {
1806 my @data = @{$individuals[0] -> {'subject_data'}};
1807 my @data_row = split( /,/ , $data[0] );
1808 for ( my $i = 0; $i < scalar @data_row; $i++ ) {
1809 my $dnt_flag = 0;
1810 foreach my $dntc ( @do_not_test_columns ) {
1811 $dnt_flag = 1 if ( $i == $dntc - 1 );
1813 $multi_value_flags[$i] = $dnt_flag;
1815 } else {
1816 die "data -> single_valued_data: No data in ID number 1\n";
1818 # Collect the stats
1819 for ( my $id = 0; $id <= $#individuals; $id++ ) {
1820 my @data = @{$individuals[$id] -> {'subject_data'}};
1821 my @data_row = split( /,/, $data[0] );
1822 for ( my $j = 0; $j < scalar @data_row; $j++ ) {
1823 my %col_unique;
1824 for ( my $i = 0; $i <= $#data; $i++ ) {
1825 my @data_row = split( /,/ , $data[$i] );
1826 $col_unique{$data_row[$j]}++;
1828 my $factors = scalar keys %col_unique;
1829 $multi_value_flags[$j]++ if ( $factors > 1 );
1832 for ( my $i = 0; $i <= $#multi_value_flags; $i++ ) {
1833 if ( $multi_value_flags[$i] ) {
1834 push ( @{$column_indexes[1]}, $i + 1);
1835 } else {
1836 push ( @{$column_indexes[0]}, $i + 1);
1839 ( $single_value_data_set, $remainder ) =
1840 $self -> subset_vertically( column_indexes => $column_indexes[0],
1841 subset_name => $subset_name,
1842 return_remainder => 1,
1843 remainder_name => $remainder_name,
1844 target => $target,
1845 keep_first_row_only => 1);
1847 end single_valued_data
1849 # }}}
1851 # {{{ subset_vertically
1853 start subset_vertically
1855 # Usage:
1857 # $subset = $data_object -> subset_vertically ( column_indexes => [1,2,6],
1858 # subset_name => 'subset.dta' );
1860 # This basic usage returns a new data object containing
1861 # columns 1,2 and 6 from the original data plus the
1862 # idcolumn. The new data object will be associated with the
1863 # file 'subset.dta'.
1865 # You get the remaining data, i.e. the original data minus
1866 # the created subset by specifying
1868 # ( $subset, $remainder ) =
1869 # $data_object -> subset_vertically ( column_indexes => [1,2,6],
1870 # subset_name => 'subset.dta',
1871 # return_remainder => 1,
1872 # remainder_name => 'remainder.dta' );
1874 # If you would like to flush the created data sets to disk and
1875 # save memory, set the I<target> argument to 'disk'. The
1876 # default value 'mem' will keep the whole data object in RAM.
1878 # The I<keep_first_row_only> argument can be used to reduce
1879 # the size of the subset data obejct by excluding all but the
1880 # first row of data from each individual.
1882 my @individuals = @{$self -> {'individuals'}};
1883 # Create remainder index array if necessary
1884 my @remainder_indexes;
1885 if ( defined $individuals[0] ) {
1886 my @data = @{$individuals[0] -> {'subject_data'}};
1887 my $idcolumn = $individuals[0] -> {'idcolumn'};
1888 # print "IC: $idcolumn\n";
1889 my $id_flag = 0;
1890 foreach my $use_index ( @column_indexes ) {
1891 $id_flag = 1 if ( $use_index == $idcolumn );
1893 if ( $return_remainder ) {
1894 # @remainder_indexes = ( $idcolumn );
1895 for ( my $i = 0; $i < scalar split(/,/,$data[0]); $i++ ) {
1896 my $rem_flag = 1;
1897 foreach my $use_index ( @column_indexes ) {
1898 $rem_flag = 0 if ( $i == $use_index -1 );
1899 # or
1900 # $i == $idcolumn -1 );
1902 push( @remainder_indexes, $i + 1 ) if ( $rem_flag );
1904 unshift( @remainder_indexes, $idcolumn ) if ( $id_flag );
1906 unshift( @column_indexes, $idcolumn ) unless ( $id_flag );
1907 } else {
1908 die "data -> single_valued_data: No data in ID number 1\n";
1911 my @new_ids;
1912 my @new_ids_2;
1913 for ( my $id = 0; $id <= $#individuals; $id++ ) {
1914 my $idnumber = $individuals[$id] -> idnumber;
1915 my $idcolumn = $individuals[$id] -> idcolumn;
1916 my @data = @{$individuals[$id] -> {'subject_data'}};
1917 my @new_data;
1918 my @new_data_2;
1919 my $use_rows = $keep_first_row_only ? 0 : $#data;
1920 for ( my $i = 0; $i <= $use_rows; $i++ ) {
1921 my @new_row;
1922 my @data_row = split( /,/, $data[$i] );
1923 foreach my $use_index ( @column_indexes ) {
1924 push( @new_row, $data_row[$use_index-1] );
1926 # print "@new_row $#new_row\n";
1927 push( @new_data, join( ',', @new_row ) );
1929 for ( my $i = 0; $i <= $#data; $i++ ) {
1930 if ( $return_remainder ) {
1931 my @new_row_2;
1932 my @data_row = split( /,/, $data[$i] );
1933 foreach my $use_index ( @remainder_indexes ) {
1934 push( @new_row_2, $data_row[$use_index-1] );
1936 # print "@new_row_2 $#new_row_2\n";
1937 push( @new_data_2, join( ',' , @new_row_2 ) );
1940 my $new_id = data::individual -> new( idnumber => $idnumber,
1941 idcolumn => $idcolumn,
1942 subject_data => \@new_data );
1943 push( @new_ids, $new_id );
1944 if ( $return_remainder ) {
1945 my $new_id_2;
1946 $new_id_2 = data::individual -> new( idnumber => $idnumber,
1947 idcolumn => $idcolumn,
1948 subject_data => \@new_data_2 );
1949 push( @new_ids_2, $new_id_2 );
1952 my @header = @{$self -> {'header'}};
1953 my @new_header;
1954 foreach my $use_index ( @column_indexes ) {
1955 push( @new_header, @header[$use_index-1] );
1957 my $comment;
1958 if( defined $self -> {'comment'} ){
1959 my @comment = @{$self -> {'comment'}};
1960 $comment = \@comment;
1962 $subset = data -> new ( filename => $subset_name,
1963 directory => $self -> {'directory'},
1964 ignoresign => $self -> {'ignoresign'},
1965 header => \@new_header,
1966 comment => $comment,
1967 individuals => \@new_ids,
1968 target => $target,
1969 ignore_missing_files => 1 );
1970 if ( $return_remainder ) {
1971 my @new_header_2;
1972 foreach my $use_index ( @remainder_indexes ) {
1973 push( @new_header_2, @header[$use_index-1] );
1975 $remainder = data -> new ( filename => $remainder_name,
1976 directory => $self -> {'directory'},
1977 ignoresign => $self -> {'ignoresign'},
1978 header => \@new_header_2,
1979 comment => $comment,
1980 individuals => \@new_ids_2,
1981 target => $target,
1982 ignore_missing_files => 1 );
1985 end subset_vertically
1987 # }}}
1989 # {{{ subsets
1991 start subsets
1993 # if ( defined $expression and defined $bins ) {
1994 # die "data -> subset: expression and bins may not both be specified\n";
1996 # if ( not ( defined $expression or defined $bins ) ) {
1997 # die "data -> subset: expression or bins must be specified\n";
1999 $self -> synchronize;
2000 my @header = @{$self -> {'header'}};
2001 my @comment = defined $self -> {'comment'} ? @{$self -> {'comment'}} : ();
2002 my @subset_ids= ();
2003 my %rnd_ids;
2004 my $key = 0;
2005 my @ids = @{$self -> {'individuals'}};
2006 if ( defined $stratify_on ) {
2007 my $work_data = $self -> copy( filename => 'work_data.dta',
2008 target => 'mem' );
2009 my %strata = %{$work_data -> factors( column => $stratify_on )};
2010 # $Data::Dumper::Maxdepth = 1;
2011 # print Dumper \%strata;
2013 while ( my ( $factor, $keys ) = each %strata ) {
2014 foreach my $key ( @{$keys} ) {
2015 my $rnd_num = rand;
2016 while ( defined $rnd_ids{$factor}{$rnd_num} ) {
2017 $rnd_num = rand;
2019 $rnd_ids{$factor}{$rnd_num} = $ids[$key];
2022 my $first = 1;
2023 while ( my ( $factor, $rnd_nums ) = each %rnd_ids ) {
2024 my @sort_rnd_nums = sort { $a <=> $b } keys %{$rnd_nums};
2025 for ( my $i = 0; $i <= $#sort_rnd_nums; $i ) {
2026 for ( my $j = 0; $j < $bins; $j++ ) {
2027 if ( $first ) {
2028 push( @subset_ids, [$rnd_ids{$factor}{$sort_rnd_nums[$i]} -> copy] );
2029 push( @incl_ids, [$rnd_ids{$factor}{$sort_rnd_nums[$i]} -> idnumber] );
2030 } else {
2031 push( @{$subset_ids[$j]}, $rnd_ids{$factor}{$sort_rnd_nums[$i]} -> copy );
2032 push( @{$incl_ids[$j]}, $rnd_ids{$factor}{$sort_rnd_nums[$i]} -> idnumber );
2034 $i++;
2035 last if $i > $#sort_rnd_nums;
2037 $first = 0;
2040 for ( my $j = 0; $j < $bins; $j++ ) {
2041 my $sdata = data -> new ( header => \@header,
2042 comment => \@comment,
2043 ignoresign => $self -> {'ignoresign'},
2044 individuals => $subset_ids[$j],
2045 ignore_missing_files => 1,
2046 target => 'disk',
2047 idcolumn => $self -> {'idcolumn'},
2048 filename => "subset_$j.dta" );
2049 #$sdata -> _write;
2050 push( @subsets, $sdata );
2052 } else {
2053 for ( my $i = 0; $i <= $#ids; $i++ ) {
2054 my $rnd_num = rand;
2055 while ( defined $rnd_ids{$rnd_num} ) {
2056 $rnd_num = rand;
2058 $rnd_ids{$rnd_num} = $ids[$i];
2060 my @keys = sort { $a <=> $b } keys %rnd_ids;
2061 my $first = 1;
2062 for ( my $i = 0; $i <= $#keys; $i ) {
2063 for ( my $j = 0; $j < $bins; $j++ ) {
2064 if ( $first ) {
2065 push( @subset_ids, [$rnd_ids{$keys[$i]} -> copy] );
2066 push( @incl_ids, [$rnd_ids{$keys[$i]} -> idnumber] );
2067 } else {
2068 push( @{$subset_ids[$j]}, $rnd_ids{$keys[$i]} -> copy );
2069 push( @{$incl_ids[$j]}, $rnd_ids{$keys[$i]} -> idnumber );
2071 $i++;
2072 last if $i > $#keys;
2074 $first = 0;
2076 for ( my $j = 0; $j < $bins; $j++ ) {
2077 my $sdata = data -> new ( header => \@header,
2078 comment => \@comment,
2079 ignoresign => $self -> {'ignoresign'},
2080 individuals => $subset_ids[$j],
2081 ignore_missing_files => 1,
2082 target => $target,
2083 idcolumn => $self -> {'idcolumn'},
2084 filename => "subset_$j.dta" );
2085 #$sdata -> _write;
2086 push( @subsets, $sdata );
2090 end subsets
2092 # }}} subsets
2094 # {{{ subset
2096 start subset
2098 $self -> synchronize;
2099 my @header = @{$self -> {'header'}};
2100 my @comment = defined $self -> {'comment'} ? @{$self -> {'comment'}} : ();
2101 my @subset_inds = ();
2102 my $key = 0;
2103 foreach my $individual ( @{$self -> {'individuals'}} ) {
2104 if ( $individual -> evaluate_expression( column => $based_on,
2105 expression => $expression ) ) {
2106 push( @subset_inds, $individual -> copy );
2107 push( @incl_individuals, $individual -> idnumber );
2108 push( @included_keys, $key );
2110 $key++;
2112 $subset = data -> new ( header => \@header,
2113 comment => \@comment,
2114 ignoresign => $self -> {'ignoresign'},
2115 individuals => \@subset_inds,
2116 idcolumn => $self -> {'idcolumn'},
2117 filename => "subset.dta" );
2119 end subset
2121 # }}} subset
2123 # {{{ target
2125 start target
2127 if ( $parm eq 'disk' and $self -> {'target'} eq 'mem' ) {
2128 $self -> {'target'} = 'disk';
2129 $self -> flush;
2130 } elsif ( $parm eq 'mem' and $self -> {'target'} eq 'disk' ) {
2131 $self -> {'target'} = 'mem';
2132 $self -> synchronize;
2135 end target
2137 # }}}
2139 # {{{ _write
2141 start _write
2143 die "ERROR: data -> _write: No filename set in data object.\n"
2144 if( $filename eq '' );
2146 # $Data::Dumper::Maxdepth = 2;
2147 # die Dumper $self -> {'individuals'};
2149 if( not defined $self -> {'individuals'} ){
2151 # If we don't have any individuals and write to a new
2152 # filename, we must first read individuals from the old
2153 # file. A call to synchronize will do that. There is no risk
2154 # of a infinite loop here since synchronize allways writes to
2155 # "full_name".
2157 unless( $filename eq $self -> full_name ){
2158 $self -> synchronize;
2162 open(FILE,">$filename") ||
2163 die "Could not create $filename\n";
2164 my $data_ref = $self -> format_data;
2165 my @data = @{$data_ref};
2166 for ( @data ) {
2167 print ( FILE );
2169 close(FILE);
2171 # if ( $PsN::config -> {'_'} -> {'use_database'} and
2172 # $self -> {'use_data_table'} ) {
2173 # # Backslashes messes up the sql syntax
2174 # my $file_str = $self->{'filename'};
2175 # my $dir_str = $self->{'directory'};
2176 # $file_str =~ s/\\/\//g;
2177 # $dir_str =~ s/\\/\//g;
2179 # # md5sum
2180 # my $md5sum = md5_hex(OSspecific::slurp_file($self-> full_name ));
2181 # my ( $date_str, $time_str );
2182 # if ( $Config{osname} eq 'MSWin32' ) {
2183 # $date_str = `date /T`;
2184 # $time_str = ' '.`time /T`;
2185 # } else {
2186 # # Assuming UNIX
2187 # $date_str = `date`;
2189 # chomp($date_str);
2190 # chomp($time_str);
2191 # my $date_time = $date_str.$time_str;
2192 # my $dbh = DBI -> connect("DBI:mysql:host=".$PsN::config -> {'_'} -> {'database_server'}.
2193 # ";databse=".$PsN::config -> {'_'} -> {'project'},
2194 # $PsN::config -> {'_'} -> {'user'},
2195 # $PsN::config -> {'_'} -> {'password'},
2197 # 'RaiseError' => 1});
2198 # my $sth;
2199 # if ( defined $self -> {'data_id'} ) {
2200 # $sth = $dbh -> prepare( "UPDATE ".$PsN::config -> {'_'} -> {'project'}.
2201 # ".data ".
2202 # "SET filename='$file_str',date='$date_time',".
2203 # "directory='$dir_str',md5sum='$md5sum' ".
2204 # "WHERE data_id='".$self -> {'data_id'}."'" );
2205 # $sth -> execute or debug -> die( message => $sth->errstr ) ;
2206 # } else {
2207 # $sth = $dbh -> prepare("INSERT INTO ".$PsN::config -> {'_'} -> {'project'}.
2208 # ".data (filename,date,directory,md5sum) ".
2209 # "VALUES ('$file_str', '$date_time', '$dir_str','".
2210 # $md5sum."' )");
2211 # $sth -> execute;
2212 # $self -> {'data_id'} = $sth->{'mysql_insertid'};
2214 # $sth -> finish;
2215 # $dbh -> disconnect;
2218 end _write
2220 # }}} _write
2222 # {{{ flush
2224 start flush
2226 # synchronizes the object with the file on disk and empties
2227 # most of the objects attributes to save memory.
2228 if( defined $self -> {'individuals'} and
2229 ( !$self -> {'synced'} or $force ) ) {
2230 $self -> _write;
2232 # $self -> {'header'} = undef;
2233 $self -> {'comment'} = undef;
2234 $self -> {'individuals'} = undef;
2235 $self -> {'synced'} = 0;
2236 # $self -> {'column_head_indices'} = undef; # kajsa comment 080211
2237 $self -> {'have_missing_data'} = undef;
2239 end flush
2241 # }}} flush
2243 # {{{ synchronize
2245 start synchronize
2247 # synchronizes the object with the file on disk
2248 unless( $self -> {'synced'} ){
2249 if( defined $self -> {'individuals'} and
2250 scalar @{$self -> {'individuals'}} > 0 ){
2251 # We should not read new data from file if we
2252 # have an individuals defined?
2253 # Perhaps there should be an attribute
2254 # 'from_file' that overrides this and reads in
2255 # the data from the file specified in filename
2256 # and overwrites whatever the object already
2257 # contains?
2258 # if( -e $self -> {'filename'} ){
2259 # $self -> _read_header;
2260 # $self -> _read_individuals;
2262 $self -> _write;
2263 } else {
2264 if( -e $self -> full_name ){
2265 unless( defined $self -> {'header'} and scalar @{$self -> {'header'}} > 0 ){
2266 $self -> _read_header;
2268 $self -> _read_individuals;
2269 } else {
2270 debug -> die( message => "Fatal error: datafile: " . $self -> full_name . " does not exist." );
2271 return;
2275 my $i = 1;
2276 foreach my $head ( @{$self -> {'header'}} ){
2277 $self -> {'column_head_indices'} -> {$head} = $i;
2278 $i++;
2280 $self -> {'synced'} = 1;
2282 end synchronize
2284 # }}} synchronize
2286 # {{{ _fisher_yates_shuffle
2288 start _fisher_yates_shuffle
2290 my $arr_ref = $parm{'array'};
2291 debug -> warn( level => 1,
2292 message => "Array of zero length received" )
2293 if ( scalar @{$arr_ref} < 1 );
2294 my $i;
2295 for ($i = @$arr_ref; --$i; ) {
2296 my $j = random_uniform_integer(1,0,$i);
2297 # my $j = int rand ($i+1);
2298 # print "$j $j_new\n";
2299 @$arr_ref[$i,$j] = @$arr_ref[$j,$i];
2302 end _fisher_yates_shuffle
2304 # }}} _fisher_yates_shuffle
2306 # {{{ _read_header
2308 start _read_header
2310 my $filename = $self -> full_name;
2311 my $ignoresign = $self -> ignoresign;
2312 my ( @data, @new_record, $row, $tmp_row, @header, $hdrstring );
2314 open(DATAFILE,"$filename") ||
2315 die "Could not open $filename for reading";
2316 my $columns;
2317 while (<DATAFILE>) {
2318 s/\s*\,\s*/\,/g;
2319 $tmp_row = $_;
2320 # @new_record = split(/\,|\s+/,$_);
2321 if ( ! (/^\s*\d+|^\s*\./) ) {
2322 $data[$row] = $tmp_row;
2323 $row++;
2324 } else {
2325 # We have reached the first data-row, return.
2326 $columns = scalar split(/\,\s*|\s+/);
2327 last;
2330 close(DATAFILE);
2332 if ( defined $self -> {'cont_column'} and not $self -> {'table_file'} ) {
2333 my $data_len = $#data;
2334 for ( my $i = $data_len; $i >= 0; $i-- ) {
2335 my @arr = split(/\,\s*|\s+/,$data[$i]);
2336 if ( $arr[$self -> {'cont_column'}-1] eq 'CONT' ) {
2337 my $start = $i == $data_len ? 0 : 1;
2338 for ( my $j = $start; $j <= $#arr; $j++ ) {
2339 if ( $j != ($self -> {'cont_column'}-1) ) {
2340 push( @header, $arr[$j] );
2343 pop( @data );
2346 # the \Q and \E here are to escape wierd ignoresigns
2347 $header[0] =~ s/\Q$ignoresign\E//
2348 if ( defined $self->ignoresign );
2349 shift( @header ) if ( $header[0] eq "" );
2350 } else {
2351 chomp( $hdrstring = pop(@data));
2352 @header = split(/\,\s*|\s+/,$hdrstring);
2353 # the \Q and \E here are to escape wierd ignoresigns
2354 $header[0] =~ s/\Q$ignoresign\E//
2355 if ( defined $self->ignoresign );
2356 shift( @header ) if ( $header[0] eq "" );
2357 if( $self -> {'table_file'} ) {
2358 my @new_header;
2359 for( my $i = 1; $i <= scalar @header; $i++ ) {
2360 if( $header[$i-1] eq 'CONT' ) {
2361 if ( defined $self -> {'cont_column'} and not $i == $self -> {'cont_column'} ) {
2362 debug -> warn( level => 1,
2363 message => "The supplied columns for the CONT data item (".
2364 $self -> {'cont_column'}.") does not match the column where the CONT ".
2365 "header was found ($i), using $i" );
2367 $self -> {'cont_column'} = $i;
2368 } else {
2369 push( @new_header, $header[$i-1] );
2372 @header = @new_header;
2373 for( my $i = 1; $i <= scalar @header; $i++ ) {
2374 if( $header[$i-1] eq 'ID' ) {
2375 if ( defined $self -> {'idcolumn'} and not $i == $self -> {'idcolumn'} ) {
2376 debug -> warn( level => 1,
2377 message => "The supplied columns for the ID data item (".
2378 $self -> {'idcolumn'}.") does not match the column where the CONT ".
2379 "header was found ($i), using $i" );
2381 $self -> {'idcolumn'} = $i;
2387 # I'm not certain on how to deal with this conflict. I'm leaving it commented because I believe this code should not be here.
2389 #<<<<<<< data_subs.pm
2390 # $header[0] =~ s/$ignoresign//
2391 # if ( defined $self->ignoresign );
2392 # shift( @header ) if ( $header[0] eq "" );
2393 #=======
2394 #>>>>>>> 1.28
2396 # It is ok with data sets without a header.
2397 # unless( scalar @header > 0 ){ debug -> die( message => 'Datafile ' . $self -> full_name . ' is empty.' ); }
2399 $self -> {'header'} = \@header;
2400 $self -> {'comment'} = \@data;
2401 # if ( $PsN::config -> {'_'} -> {'use_database'} and
2402 # $self -> {'use_data_table'} ) {
2403 # my $dbh = DBI -> connect("DBI:mysql:host=".$PsN::config -> {'_'} -> {'database_server'}.
2404 # ";databse=".$PsN::config -> {'_'} -> {'project'},
2405 # $PsN::config -> {'_'} -> {'user'},
2406 # $PsN::config -> {'_'} -> {'password'},
2407 # {'RaiseError' => 1});
2408 # if ( scalar @header < 1 ) {
2409 # for ( my $i = 1; $i <= $columns; $i++ ) {
2410 # push( @header, $i );
2413 # for ( my $i = 0; $i <= $#header; $i++ ) {
2414 # my $sth = $dbh -> prepare("INSERT INTO ".$PsN::config -> {'_'} -> {'project'}.
2415 # ".data_column ".
2416 # "(name,number,data_id) ".
2417 # "VALUES ('".$header[$i]."', '".($i+1).
2418 # "', '".$self -> {'data_id'}."' )");
2419 # $sth -> execute;
2420 # push( @{$self -> {'data_column_ids'}}, $sth->{'mysql_insertid'} );
2421 # $sth -> finish;
2423 # $dbh -> disconnect;
2426 end _read_header
2428 # }}} _read_header
2430 # {{{ _read_individuals
2432 start _read_individuals
2434 my $idcol = $self -> idcolumn;
2435 my $filename = $self -> full_name;
2436 #debug -> warn( level => 1,
2437 # message => "Building array of individuals from file " . $self -> {'filename'} );
2438 open(DATAFILE,"$filename") ||
2439 die "Could not open $filename for reading";
2440 my ( @new_row, $new_ID, $old_ID, @init_data );
2441 my $buffer;
2442 my $lines = 0;
2443 while (sysread DATAFILE, $buffer, 4096) {
2444 $lines += ($buffer =~ tr/\n//);
2446 seek( DATAFILE, 0,0 );
2448 # For status bar:
2449 my $status_bar = status_bar -> new( steps => $lines );
2451 ui -> print( category => 'scm',
2452 message => "Reading data file: ".$self -> filename );
2453 ui -> print( category => 'scm',
2454 message => $status_bar -> print_step(),
2455 newline => 0);
2457 my ( $sth, $dbh, $first_row_id, $first_value_id );
2458 my $insert = 1;
2460 my $insert_rows;
2461 my $insert_values;
2462 my $row_counter = 0;
2463 # my $individual_counter = 0;
2464 my $table_counter = 0;
2465 my $full_row;
2466 ROW: while ( <DATAFILE> ) {
2467 #scan to first table to read. Will do next ROW until table header $skip_tables+1
2468 if (($skip_tables>0) && ($table_counter<= $skip_tables)){
2469 #if want to read only part of file and have not found start place yet
2470 if ( /^TABLE / ) {
2471 $table_counter++;
2473 next ROW;
2476 s/^ *//;
2477 s/\s*\,\s*/\,/g;
2478 my @new_row = split(/\,\s*|\s+/);
2479 # This regexp check is not time consuming.
2480 if ( /^\s*\d+|^\s*\./ ) {
2481 if ( defined $self -> {'cont_column'} ) {
2482 if ( $new_row[$self -> {'cont_column'} - 1] == 1 ) {
2483 if ( not $self -> {'table_file'} ) { # Skip the CONT=1 rows if this is a table file
2484 for ( my $i = $#new_row; $i > 0; $i-- ) {
2485 if ( $i != ($self -> {'cont_column'} - 1) ) {
2486 unshift( @{$full_row}, $new_row[$i] );
2490 next ROW;
2491 } else {
2492 for ( my $i = $#new_row; $i >= 0; $i-- ) {
2493 # if ( $i != ($self -> {'cont_column'} - 1) or $self -> {'table_file'} ) {
2494 if ( $i != ($self -> {'cont_column'} - 1) ) {
2495 unshift( @{$full_row}, $new_row[$i] );
2499 } else {
2500 @{$full_row} = @new_row;
2502 $new_ID = $full_row -> [$idcol-1]; # index starts at 0
2503 $old_ID = $new_ID if ( not defined $old_ID );
2505 #If we have not yet found first individual to read, then
2506 #count each time new individual found. If new individual is
2507 #first individual to read, then reset old_ID so that
2508 #the individual's lines will be read before new individual is pushed.
2509 #If we have not found first individual to read then skip to next line in file
2510 #if ($skip_individuals > $individual_counter) {
2511 ##can only enter here if skip_individuals > 0 and have not started reading
2512 #if ( $new_ID != $old_ID ) {
2513 # $individual_counter++;
2515 #$old_ID = $new_ID;
2516 #unless ($skip_individuals == $individual_counter) {
2517 # #discard what has been read and skip to next line
2518 # $full_row = undef;
2519 # next ROW;
2524 # Check if column miss data at some row (This adds about 30% of init time)
2525 my $mdt = $self -> {'missing_data_token'};
2526 for( my $i = 0; $i <= $#{$full_row}; $i++ ){
2527 $self -> {'have_missing_data'} -> {$i+1} = 1
2528 if( $full_row -> [$i] == $mdt ); # == is slower but safer than eq
2531 if ( $new_ID != $old_ID ) {
2532 my @subject_data = @init_data;
2533 my $id = data::individual -> new ( idcolumn => $idcol,
2534 subject_data => \@subject_data,
2535 data_id => $self -> {'data_id'} );
2536 push( @{$self -> {'individuals'}}, $id );
2537 #check if have stored max number individuals
2538 if ($max_individuals > 0) {
2539 if (scalar(@{$self -> {'individuals'}}) == $max_individuals){
2540 @init_data =(); #prevent any more rows from being stored after loop
2541 last ROW;
2545 @init_data =(join( ",", @{$full_row}));
2546 } else {
2547 push( @init_data, join( ",", @{$full_row}) );
2549 $old_ID = $new_ID;
2550 $full_row = undef;
2552 if ( $status_bar -> tick() ) {
2553 ui -> print( category => 'scm',
2554 message => $status_bar -> print_step(),
2555 wrap => 0,
2556 newline => 0 );
2560 # if we have ended loop because of max number individuals, init_data will be empty.
2561 if ( $#init_data >= 0 ) {
2562 push( @{$self -> {'individuals'}},
2563 data::individual -> new ( idcolumn => $idcol,
2564 subject_data => \@init_data ) );
2566 ui -> print( category => 'scm',
2567 message => " ... done" );
2568 close(DATAFILE);
2569 # $self -> _write( filename => 'test.dta' );
2571 end _read_individuals
2573 # }}} _read_individuals
2575 # {{{ create_row_filter
2577 start create_row_filter
2579 unless ($self -> {'synced'}){
2580 'debug' -> die(message => "Cannot use this function with non-synced data.");
2583 my %index_hash=();
2584 my $index=0;
2585 my $keep;
2586 my $ind_counter=0;
2588 my $count=scalar(@{$self -> {'individuals'}});
2589 if ($no_individuals < 1){
2590 'debug' -> die(message =>"Requested filter length must be at least 1 individual.");
2592 if ($no_individuals > $count){
2593 'debug' -> die(message =>"Requested filter length $no_individuals individuals is larger ".
2594 "than number of individuals in loaded dataset $count.");
2598 # if ( not $column =~ /^\d/ ) {
2599 # $column = $self -> {'column_head_indices'} -> {$column} - 1;
2602 foreach my $column_name (@{$self -> {'header'}}){
2603 #could add if-statement here to allow different filter types.
2604 if ($column_name =~ /^(MDV)$/){
2605 if (exists $index_hash{$1}){
2606 'debug' -> die(message =>"Found column header $1 twice, bailing out.");
2608 $index_hash{$1}=$index;
2610 $index++;
2613 #if no columns to filter on were found, return empty filter array
2614 #which means "keep everything". Otherwise enter loop below.
2616 unless (scalar(keys %index_hash) < 1){
2617 foreach my $individual ( @{$self -> {'individuals'}} ) {
2618 foreach my $datarow ( @{$individual -> {'subject_data'}}){
2619 $keep = 1;
2620 my @row = split( /,/ , $datarow );
2621 foreach my $key (keys %index_hash){
2622 if ($row[$index_hash{$key}] != 0){
2623 $keep = 0;
2624 last;
2627 push (@filter,$keep);
2629 $ind_counter++;
2630 if ($ind_counter == $no_individuals){
2631 last;
2637 end create_row_filter
2639 # }}} create_row_filter
2641 # {{{ append_columns_to_matrix
2643 start append_columns_to_matrix
2645 my $row_count = scalar(@{$matrix}); #avreferera
2646 my ($column_count,$offset);
2648 if ($row_count < 1){
2649 'debug' -> die(message => "Matrix cannot be empty in append_columns_to matrix.");
2651 if (scalar(@columns)<1){
2652 'debug' -> die(message => "Column array cannot be empty in append_columns_to matrix.");
2654 if (scalar(@columns)%$row_count > 0){
2655 'debug' -> die(message => "Column element count must be multiple of row count in matrix.");
2658 $column_count = scalar(@columns)/$row_count;
2660 for (my $i=0; $i<$column_count; $i++){
2661 $offset=$i*$row_count;
2662 for (my $j=0; $j<$row_count; $j++){
2663 ${$matrix}[$j] .= ','. $columns[$offset+$j];
2667 end append_columns_to_matrix
2669 # }}} append_columns_to_matrix