bootstrap memleak plugged
[PsN.git] / lib / data_subs.pm
blob676910078358da5f0bfd5e1f4d8546ab12ad9f91
1 This file contains the code for the subroutines in
2 Perl-speaks-NONMEMs data module. It is not functional by itself.
3 The code should be transferred to the module autogenerated by dia2code
4 using the fill_diacode.pl script.
6 TODO: Fix the synced attribute so that all methods that change the state of the
7 data object sets synced=0
9 # {{{ include
11 start include statements
12 use Digest::MD5 'md5_hex';
13 use OSspecific;
14 use File::Copy "cp";
15 use Carp;
16 use Carp qw(cluck);
17 use Config;
18 use Math::Random;
19 use Storable;
20 use debug;
21 use ui;
22 use status_bar;
23 use Data::Dumper;
24 use Time::HiRes qw(gettimeofday);
25 my @primary_column_names = ('ID', 'DATE', 'DAT1', 'DAT2', 'DAT3' ,'L1', 'L2', 'DV', 'MDV', 'RAW_', 'MRG_', 'RPT_', 'TIME', 'DROP', 'SKIP', 'EVID', 'AMT', 'RATE', 'SS', 'II', 'ADDL', 'CMT', 'PCMT', 'CALL');
26 end include
28 # }}} include statements
30 # {{{ description
32 start description
33 # The structure of the data class is subject-centric, recognising that
34 # the subjects included in a study often can be regarded as
35 # independent. A class for the subject level exists within PsN and is
36 # called the individual class. A data object consists of at least one
37 # but probably many individual objects plus optional comments.
38 end description
40 # }}} description
42 # {{{ synopsis
44 start synopsis
45 # use data;
47 # my $data_obj = data -> new ( filename => 'test040314.dta' );
49 # $data_obj -> renumber_ascending;
51 # my $subsets_ref = $data_obj -> case_deletion( bins => 10 );
53 # my @subsets = @{$subsets_ref};
54 end synopsis
56 # }}} synopsis
58 # {{{ see_also
60 start see_also
61 # =begin html
63 # <a HREF="model.html">model</a>, <a HREF="output.html">output</a>,
64 # <a HREF="tool/modelfit.html">tool::modelfit</a>,
65 # <a HREF="tool.html">tool</a>
67 # =end html
69 # =begin man
71 # model, output, tool::modelfit, tool
73 # =end man
74 end see_also
76 # }}} see_also
78 # {{{ new
80 start new
82 # If the column holding the subject identifier is not the
83 # first, it can be specified using the I<idcolumn> attribute
85 # I<ignoresign> determines which rows that are regarded as
86 # comments. Corresponds to the IGNORE= option in the $DATA
87 # record in a NONMEM model file.
89 $this -> {'use_data_table'} = 0;
91 ( $this -> {'directory'},
92 $this -> {'filename'} ) = OSspecific::absolute_path( $this -> {'directory'},
93 $this->{'filename'} );
95 debug -> warn( level => 2,
96 message => "data -> new: Data object initialized from file: ".
97 $this -> full_name );
99 # sub register_in_database {
100 # my $this = shift;
101 # # Backslashes messes up the sql syntax
102 # my $file_str = $this->{'filename'};
103 # my $dir_str = $this->{'directory'};
104 # $file_str =~ s/\\/\//g;
105 # $dir_str =~ s/\\/\//g;
107 # # md5sum
108 # my $md5sum = md5_hex(OSspecific::slurp_file($this-> full_name ));
109 # my $dbh = DBI ->
110 # connect("DBI:mysql:host=".$PsN::config -> {'_'} -> {'database_server'}.
111 # ";databse=".$PsN::config -> {'_'} -> {'project'},
112 # $PsN::config -> {'_'} -> {'user'},
113 # $PsN::config -> {'_'} -> {'password'},
114 # {'RaiseError' => 1});
115 # my $sth;
116 # my $sth = $dbh -> prepare( "SELECT data_id FROM ".$PsN::config -> {'_'} -> {'project'}.
117 # ".data ".
118 # "WHERE filename = '$file_str' AND ".
119 # "directory = '$dir_str' AND ".
120 # "md5sum = '".$md5sum."'" );
121 # $sth -> execute or debug -> die( message => $sth->errstr ) ;
122 # my $select_arr = $sth -> fetchall_arrayref;
123 # if ( scalar @{$select_arr} > 0 ) {
124 # debug -> warn( level => 1,
125 # message => "Found an old entry in the database matching the ".
126 # "current data file" );
127 # if ( scalar @{$select_arr} > 1 ) {
128 # debug -> warn( level => 1,
129 # message => "Found more than one matching entry in database".
130 # ", using the first" );
132 # $this -> {'data_id'} = $select_arr->[0][0];
133 # } else {
134 # my ( $date_str, $time_str );
135 # if ( $Config{osname} eq 'MSWin32' ) {
136 # $date_str = `date /T`;
137 # $time_str = ' '.`time /T`;
138 # } else {
139 # # Assuming UNIX
140 # $date_str = `date`;
142 # chomp($date_str);
143 # chomp($time_str);
144 # my $date_time = $date_str.$time_str;
145 # $sth = $dbh -> prepare("INSERT INTO ".$PsN::config -> {'_'} -> {'project'}.
146 # ".data (filename,date,directory,md5sum) ".
147 # "VALUES ('$file_str', '$date_time', '$dir_str','".
148 # $md5sum."' )");
149 # $sth -> execute;
150 # $this -> {'data_id'} = $sth->{'mysql_insertid'};
152 # $sth -> finish;
153 # $dbh -> disconnect;
157 unless ( ( defined $this -> {'header'} and
158 scalar @{$this -> {'header'}} > 0 ) or
159 ( defined $this -> {'individuals'} and
160 scalar @{$this -> {'individuals'}} > 0 ) ) {
161 if ( -e $this -> full_name ) {
162 if ( $this -> {'target'} eq 'mem' ) {
163 # &register_in_database( $this ) if ( $PsN::config -> {'_'} -> {'use_database'} and
164 # $this -> {'use_data_table'} );
165 $this -> _read_header;
166 $this -> _read_individuals;
167 $this -> {'synced'} = 1;
168 } else {
169 $this -> {'synced'} = 0;
171 } else {
172 debug -> die(message => "No header, individuals, and no file " . $this -> full_name . " on disk.")
173 unless $this -> {'ignore_missing_files'};
174 $this -> {'synced'} = 0;
176 } else {
177 if ( $this -> {'target'} eq 'mem') {
178 if ( -e $this -> {'filename'} ) {
179 $this -> _read_header;
180 # &register_in_database if ( $PsN::config -> {'_'} -> {'use_database'} and
181 # $this -> {'use_data_table'} );
182 $this -> _read_individuals;
183 $this -> {'synced'} = 1;
184 } else {
185 debug -> die(message => "No file:".$this->{'filename'}." on disk" )
186 unless $this -> {'ignore_missing_files'};
187 $this -> {'synced'} = 0;
189 } else {
190 $this -> flush;
194 if ( $this -> {'synced'} ) {
195 my $i = 1;
196 foreach my $head ( @{$this -> {'header'}} ) {
197 $this -> {'column_head_indices'} -> {$head} = $i;
198 $i++;
201 # $Data::Dumper::Maxdepth = 3;
202 # die Dumper $this -> {'individuals'};
204 end new
206 # }}} new
208 # {{{ register_in_database
209 start register_in_database
210 if ( $PsN::config -> {'_'} -> {'use_database'} ) {
211 # Backslashes messes up the sql syntax
212 my $file_str = $self->{'filename'};
213 my $dir_str = $self->{'directory'};
214 $file_str =~ s/\\/\//g;
215 $dir_str =~ s/\\/\//g;
217 my $project = $PsN::config -> {'_'} -> {'project'};
218 # md5sum
219 my $md5sum = md5_hex(OSspecific::slurp_file($self-> full_name ));
221 my $dbh = DBI -> connect("DBI:mysql:host=".$PsN::config -> {'_'} -> {'database_server'}.
222 ";databse=".$project,
223 $PsN::config -> {'_'} -> {'user'},
224 $PsN::config -> {'_'} -> {'password'},
225 {'RaiseError' => 1});
227 my $sth;
229 my $select_arr = [];
231 if ( not $force ) {
232 my $sth = $dbh -> prepare( "SELECT data_id FROM ".$project.
233 ".data ".
234 "WHERE filename = '$file_str' AND ".
235 "directory = '$dir_str' AND ".
236 "md5sum = '".$md5sum."'" );
237 $sth -> execute or debug -> die( message => $sth->errstr ) ;
238 $select_arr = $sth -> fetchall_arrayref;
241 if ( scalar @{$select_arr} > 0 ) {
242 'debug' -> warn( level => 1,
243 message => "Found an old entry in the database matching the ".
244 "current data file" );
245 if ( scalar @{$select_arr} > 1 ) {
246 'debug' -> warn( level => 1,
247 message => "Found more than one data matching entry in database".
248 ", using the first" );
250 $self -> {'data_id'} = $select_arr->[0][0];
251 # Find the id's
252 my $sth = $dbh -> prepare( "SELECT individual_id FROM ".$project.".data_individual ".
253 "WHERE data_id = '".$self -> {'data_id'}."'" );
254 $sth -> execute or debug -> die( message => $sth->errstr ) ;
255 my $id_arr = $sth -> fetchall_arrayref;
256 map( $_ = $_ -> [0], @{$id_arr} );
257 $self -> {'individual_ids'} = $id_arr;
258 } elsif ( defined $self -> {'individuals'} ) {
259 my ( $date_str, $time_str );
260 if( $Config{osname} eq 'MSWin32' ){
261 $date_str = `date /T`;
262 $time_str = ' '.`time /T`;
263 } else {
264 # Assuming UNIX
265 $date_str = `date`;
267 chomp($date_str);
268 chomp($time_str);
269 my $date_time = $date_str.$time_str;
270 my ( $columns, $values );
271 my $res_str = $resampled ? '1' : '0';
272 if ( defined $model_id ) {
273 $columns = '(model_id, filename, date, directory, md5sum, resampled)';
274 $values = "('$model_id', '$file_str', '$date_time', '$dir_str','".
275 $md5sum."', '$res_str' )";
276 } else {
277 $columns = '(filename, date, directory, md5sum, resampled)';
278 $values = "('$file_str', '$date_time', '$dir_str','".$md5sum."', '$res_str' )";
280 $sth = $dbh -> prepare("INSERT INTO ".$PsN::config -> {'_'} -> {'project'}.
281 ".data $columns VALUES $values");
282 $sth -> execute;
283 $self -> {'data_id'} = $sth->{'mysql_insertid'};
285 if ( defined $self -> {'data_id'} ) {
286 my $values;
287 my $columns = "( id_key, id )";
288 if( $#individual_ids >= 0 ) {
289 $self -> register_di_relation( individual_ids => \@individual_ids );
290 } else {
291 my $inds = scalar @{$self -> {'individuals'}};
292 $dbh -> do( "LOCK TABLES ".$PsN::config -> {'_'} -> {'project'}.
293 ".individual WRITE" );
294 # $sth = $dbh -> prepare( "SELECT MAX(individual_id)".
295 # " FROM ".$PsN::config -> {'_'} -> {'project'}.
296 # ".individual" );
297 $dbh -> do( 'USE '.$PsN::config -> {'_'} -> {'project'} );
298 $sth = $dbh -> prepare( "SHOW TABLE STATUS LIKE 'individual'" );
299 $sth -> execute or debug -> die( message => $sth->errstr ) ;
300 my $select_arr = $sth -> fetchall_arrayref;
301 my $first_id_id = $select_arr -> [0][10] ?
302 $select_arr -> [0][10] : 0;
303 # my $first_id_id = $select_arr -> [0][0] ? ($select_arr -> [0][0] + 1) : 0;
304 my $last_id_id = $first_id_id + $inds - 1;
305 for( my $i = 0; $i < $inds; $i++ ) {
306 if( defined $self -> {'individuals'}[$i] ) {
307 my $id_id = $self -> {'individuals'}[$i] -> idnumber;
308 $values = $values."," if ( defined $values );
309 $values = $values."( $i, $id_id )";
312 $sth = $dbh -> prepare( "INSERT INTO ".$PsN::config -> {'_'} -> {'project'}.
313 ".individual $columns VALUES $values" );
314 $sth -> execute;
315 $dbh -> do( "UNLOCK TABLES" );
316 @individual_ids = ($first_id_id .. $last_id_id);
317 $self -> register_di_relation( individual_ids => \@individual_ids );
319 $self -> {'individual_ids'} = \@individual_ids;
321 $sth -> finish;
322 $dbh -> disconnect;
324 $data_id = $self -> {'data_id'}; # return the data_id
326 end register_in_database
327 # }}} register_in_database
329 # {{{ register_di_relation
330 start register_di_relation
331 if ( $PsN::config -> {'_'} -> {'use_database'} and
332 defined $self -> {'data_id'} and $#individual_ids >= 0 ) {
333 my $dbh = DBI -> connect("DBI:mysql:host=".$PsN::config -> {'_'} -> {'database_server'}.
334 ";databse=".$PsN::config -> {'_'} -> {'project'},
335 $PsN::config -> {'_'} -> {'user'},
336 $PsN::config -> {'_'} -> {'password'},
337 {'raiseerror' => 1});
338 my $sth;
339 my $values;
340 my $columns = "( data_id, individual_id )";
341 foreach my $individual_id ( @individual_ids ) {
342 if ( defined $individual_id ) {
343 $values = $values."," if ( defined $values );
344 $values = $values."(".$self -> {'data_id'}.", $individual_id )";
347 $sth = $dbh -> prepare( "INSERT INTO ".$PsN::config -> {'_'} -> {'project'}.
348 ".data_individual $columns VALUES $values" );
349 $sth -> execute;
350 $sth -> finish if ( defined $sth );
351 $dbh -> disconnect;
353 end register_di_relation
354 # }}} register_di_relation
356 # {{{ full_name
358 start full_name
360 $full_name = $self -> {'directory'} . $self -> {'filename'};
362 end full_name
364 # }}}
366 # {{{ bootstrap
368 start bootstrap
370 # The bootstrap method draws I<samples> number of boostrap
371 # samples from the data set. The I<subjects> arguments
372 # determines the size of each sample (default equals to the
373 # number of individuals in the original data set). The method
374 # returns references to three arrays: I<boot_samples_ref>,
375 # which holds the bootstrap data sets, I<incl_individuals_ref>
376 # which holds arrays containing the subject identifiers (ID's)
377 # for the included individuals of each bootstrap data set and
378 # I<included_keys_ref> which holds the key or index of the
379 # included individuals. The key or index is an integer
380 # starting at 1 for the first individual in the original data
381 # set and increasing by one for each following.
382 $self -> synchronize;
383 my @header = @{$self -> {'header'}};
384 my $individuals = $self -> {'individuals'};
385 my $key_ref;
387 my $status_bar = status_bar -> new( steps => $samples );
388 ui -> print( category => 'bootstrap',
389 message => $status_bar -> print_step,
390 newline => 0);
392 for ( my $i = 1; $i <= $samples; $i++ ) {
393 my $new_name = defined $name_stub ? $name_stub."_$i.dta" : "bs$i.dta";
394 $new_name = $directory.'/'.$new_name;
395 my ( $boot, $incl_ind_ref, $incl_key_ref ) =
396 $self -> resample( subjects => $subjects,
397 resume => $resume,
398 new_name => $new_name,
399 target => $target,
400 stratify_on => $stratify_on,
401 model_id => $model_ids[$i-1] );
402 push( @included_keys, $incl_key_ref );
403 push( @incl_individuals, $incl_ind_ref );
404 # $boot -> renumber_ascending;
405 push( @boot_samples, $boot );
406 # $boot -> synchronize;
407 # $boot -> flush;
408 if( $status_bar -> tick() ){
409 ui -> print( category => 'bootstrap',
410 message => $status_bar -> print_step,
411 newline => 0,
412 wrap => 0);
415 ui -> print( category => 'bootstrap',
416 message => ' ... done' );
418 end bootstrap
420 # }}} bootstrap
422 # {{{ resample
424 start resample
426 $self -> synchronize;
427 my ( @header, $individuals, @bs_inds, $key_ref, @id_ids, @bs_id_ids );
428 @id_ids = @{$self -> {'individual_ids'}} if( defined $self -> {'individual_ids'} );
429 if ( defined $stratify_on ) {
430 unless ( $resume and -e $new_name ) {
431 @header = @{$self -> {'header'}};
432 $individuals = $self -> {'individuals'};
433 my %strata;
434 if( $stratify_on =~ /\D/ ){
435 %strata = %{$self -> factors( column_head => $stratify_on )};
436 if ( $strata{'Non-unique values found'} eq '1' ) {
437 debug -> die( message => "Individuals were found to have multiple values in the $stratify_on column. ".
438 "The column $stratify_on cannot be used for stratification of the resampling." );
440 } else {
441 %strata = %{$self -> factors( column => $stratify_on )};
442 if ( $strata{'Non-unique values found'} eq '1' ) {
443 debug -> die( message => "Individuals were found to have multiple values in column number $stratify_on. ".
444 "Column $stratify_on cannot be used for stratification of the resampling." );
448 while( my ( $factor, $key_list ) = each %strata ) {
449 my $keys = scalar @{$key_list};
450 for ( my $i = 0; $i < $keys; $i++ ) {
451 my $list_ref = random_uniform_integer(1,0,$keys-1);
452 push( @bs_inds, $individuals ->
453 [ $key_list -> [$list_ref] ] -> copy );
454 push( @included_keys, $key_list -> [$list_ref] );
455 push( @incl_individuals, $individuals ->
456 [ $key_list -> [$list_ref] ] -> idnumber );
457 push( @bs_id_ids, $id_ids[ $key_list -> [$list_ref] ] );
461 $boot = data -> new( header => \@header,
462 idcolumn => $self -> {'idcolumn'},
463 ignoresign => $self -> {'ignoresign'},
464 individuals => \@bs_inds,
465 filename => $new_name,
466 ignore_missing_files => 1,
467 target => 'mem' );
468 $boot -> renumber_ascending;
469 $boot -> _write;
470 $boot -> flush;
471 #$boot -> target( $target );
473 } else {
474 unless ( $resume and -e $new_name ) {
475 @header = @{$self -> {'header'}};
476 $individuals = $self -> {'individuals'};
477 for ( my $i = 1; $i <= $subjects; $i++ ) {
478 $key_ref = random_uniform_integer(1,0,scalar @{$individuals}-1);
479 push( @bs_inds, $individuals -> [ $key_ref ] -> copy );
480 push( @included_keys, $key_ref );
481 push( @incl_individuals, $individuals -> [ $key_ref ] -> idnumber );
482 push( @bs_id_ids, $id_ids[ $key_ref ] );
485 # MUST FIX: If a file already exists with the same name,
486 # the created bs data set will be appended to this. IT
487 # MUST BE OVERWRITTEN!
488 $boot = data -> new( header => \@header,
489 idcolumn => $self -> {'idcolumn'},
490 ignoresign => $self -> {'ignoresign'},
491 individuals => \@bs_inds,
492 filename => $new_name,
493 ignore_missing_files => 1,
494 target => 'mem' );
495 $boot -> renumber_ascending;
496 $boot -> _write;
497 $boot -> target( $target );
498 } else {
499 # If we are resuming, we still need to generate the
500 # pseudo-random sequence and initiate a data object
501 for ( my $i = 1; $i <= $subjects; $i++ ) {
502 random_uniform_integer(1,0,scalar @{$individuals}-1)
504 $boot = data -> new( idcolumn => $self -> {'idcolumn'},
505 ignoresign => $self -> {'ignoresign'},
506 filename => $new_name,
507 ignore_missing_files => 1,
508 target => $target );
509 $boot -> _write;
510 $boot -> flush;
512 if( $target eq 'disk'){
513 $boot -> flush;
516 $boot -> register_in_database( individual_ids => \@bs_id_ids,
517 resampled => 1,
518 model_id => $model_id );
520 end resample
522 # }}} resample
524 # {{{ case_deletion
526 start case_deletion
528 # case_deletion creates subsets of the data. The number of
529 # subsets is specified by the bins argument. The individuals
530 # of each subset is selected randomly or in ascending
531 # numerical order depending on the selection argument that can
532 # be either 'consecutive' or 'random'. case_column must be
533 # specified to give the method something to base the selection
534 # on. Valid case_column values are either the column number
535 # (pure digits) or the name of the column in the (optional)
536 # header row.
537 $self -> synchronize;
538 my @header = @{$self -> {'header'}};
539 if ( not defined $case_column ) {
540 debug -> die( message => "case_column must be specified" );
541 } else {
542 if ( not $case_column =~ /^\d/ ) {
543 for ( my $i = 0; $i <= $#header; $i++ ) {
544 $case_column = $i+1 if ( $header[$i] eq $case_column );
548 $bins = defined $bins ? $bins :
549 scalar keys %{$self -> factors( column => $case_column)};
550 my %factors = %{$self -> factors( column => $case_column )};
551 if ( $factors{'Non-unique values found'} eq '1' ) {
552 debug -> die( message => "Individuals were found to have multiple values in column number $case_column. ".
553 "Column $case_column cannot be used for case deletion." );
556 my $maxbins = scalar keys %factors;
557 my @ftrs = sort { $a <=> $b } keys %factors;
558 my $individuals = $self -> {'individuals'};
559 my $maxkey = scalar @{$individuals} - 1;
561 my ( @tmp_ftrs, @binsize ) =
562 ((),());
563 my ( $k, $j, $i ) = ( 0, 0, 0 );
564 # Create the binsizes
565 for ( $j = 0; $j < $maxbins; $j++ ) {
566 $binsize[ $k++ ]++;
567 $k = 0 if( $k >= $bins );
569 $self -> _fisher_yates_shuffle( array => \@ftrs ) if( $selection eq 'random' );
570 for ( $k = 0; $k < $bins; $k++ ) {
571 for ( $j = 0; $j < $binsize[ $k ]; $j++ ) {
572 # print "SK: ",$skipped_keys[ $k ]," F: ",$factors{ $ftrs[ $i ] },"\n";
573 push( @{$skipped_keys[ $k ]}, @{$factors{ $ftrs[ $i ] }} );
574 push( @{$skipped_values[ $k ]}, $ftrs[ $i++ ] );
578 for ( $k = 0; $k < $bins; $k++ ) {
579 my @cd_inds = ();
580 my @del_inds = ();
581 SELKEYS: foreach my $key ( 0..$maxkey ) {
582 foreach my $skipped ( @{$skipped_keys[ $k ]} ) {
583 if ( $key == $skipped ) {
584 push( @{$skipped_ids[ $k ]}, $individuals ->
585 [ $skipped ] -> idnumber );
586 push( @del_inds, $individuals -> [ $key ] -> copy );
587 next SELKEYS;
590 push( @cd_inds, $individuals -> [ $key ] -> copy );
592 # Set ignore_missing_files = 1 to make it possible to get the result
593 # in memory only
594 my $newdata = data ->
595 new ( header => \@header,
596 ignoresign => $self -> {'ignoresign'},
597 idcolumn => $self -> {'idcolumn'},
598 individuals => \@cd_inds,
599 target => $target,
600 filename => 'cd'.$k+1 .'.dta',
601 ignore_missing_files => 1 );
602 my $deldata = data ->
603 new ( header => \@header,
604 ignoresign => $self -> {'ignoresign'},
605 idcolumn => $self -> {'idcolumn'},
606 individuals => \@del_inds,
607 target => $target,
608 filename => 'del'.$k+1 .'.dta',
609 ignore_missing_files => 1 );
610 push( @subsets, $newdata );
611 push( @remainders, $deldata );
614 end case_deletion
616 # }}} case_deletion
618 # {{{ copy
619 start copy
621 # filename: new data file name.
623 # target: keep the copy in memory ('mem') or write it to disk and flush the memory ('disk').
625 ($directory, $filename) = OSspecific::absolute_path( $directory, $filename );
627 # Clone self into new data object. Why don't the individuals get cloned too?
628 # strange. need to set synced to 0 AND set the {'individuals'} to undef.
629 cp($self -> full_name, $directory.$filename );
630 $new_data = Storable::dclone( $self );
631 $new_data -> {'synced'} = 0;
632 $new_data -> {'individuals'} = undef;
633 $new_data -> synchronize;
635 # Set the new file name for the copy
636 $new_data -> directory( $directory );
637 $new_data -> filename( $filename );
639 end copy
641 # }}} copy
643 # {{{ count_ind
645 start count_ind
647 # Returns the number of individuals in the data set.
648 $self -> synchronize;
649 $num = scalar @{$self -> {'individuals'}};
651 end count_ind
653 # }}} count_ind
655 # {{{ diff
656 start diff
658 $self -> synchronize;
660 my $first_id = $self -> {'individuals'}[0];
662 debug -> die( message => "No individuals defined in data object based on ".
663 $self -> full_name ) unless ( defined $first_id );
665 # Check if $column(-index) is defined and valid, else try to find index
666 # using column_head
668 my @data_row = split( /,/, $first_id -> subject_data -> [0] );
669 if( $#columns >= 0 ) {
670 foreach my $column ( @columns ) {
671 unless ( defined $column && defined( $data_row[$column-1] ) ) {
672 debug -> die( message => "Error in data -> factors: ".
673 "invalid column number: \"$column\"\n".
674 "Valid column numbers are 1 to ".
675 scalar @{$first_id -> subject_data ->[0]}."\n" );
678 } elsif ( $#column_heads >= 0 ) {
679 foreach my $column_head ( @column_heads ) {
680 unless (defined($column_head) && defined($self -> {'column_head_indices'}{$column_head})) {
681 debug -> die( message => "Error in data -> factors: unknown column: \"$column_head\" ".
682 "Valid column headers are (in no particular order):\n".
683 join(', ',keys(%{$self -> {'column_head_indices'}})) );
684 } else {
685 my $column = $self -> {'column_head_indices'}{$column_head};
686 push( @columns, $column );
687 debug -> warn( level => 2,
688 message => "$column_head is in column number $column" );
691 } else {
692 debug -> die( message => "No column or column_head defined" );
695 if( $global_largest or $global_smallest or
696 $largest_per_individual or $smallest_per_individual ) {
697 if( not scalar @{$self -> {'individuals'}} == scalar @{$against_data -> individuals} ) {
698 debug -> die( message => "Both data object must hold the same number of individuals ".
699 "and observations when calling data -> diff" );
701 for( my $i = 0; $i < scalar @{$self -> {'individuals'}}; $i++ ) {
702 my %id_diffs = %{$self -> {'individuals'}[$i] ->
703 diff( against_individual => $against_data -> individuals -> [$i],
704 columns => \@columns,
705 absolute_diff => $absolute_diff,
706 diff_as_fraction => $diff_as_fraction,
707 largest => ( $global_largest or $largest_per_individual ),
708 smallest => ( $global_smallest or $smallest_per_individual ) )};
709 if( $global_largest ) {
710 for( my $j = 0; $j <= $#columns; $j++ ) {
711 my $label = defined $column_heads[$j] ? $column_heads[$j] : $columns[$j];
712 if( not defined $diff_results{$label} or not defined $diff_results{$label}{'diff'} or
713 $id_diffs{$columns[$j]}{'diff'} > $diff_results{$label}{'diff'} ) {
714 $diff_results{$label}{'diff'} = $id_diffs{$columns[$j]}{'diff'};
715 $diff_results{$label}{'self'} = $id_diffs{$columns[$j]}{'self'};
716 $diff_results{$label}{'test'} = $id_diffs{$columns[$j]}{'test'};
721 } else {
722 die "data -> diff is only implemented for finding the largest difference at any observation at this point\n";
725 end diff
726 # }}} diff
728 # {{{ filename
729 start filename
731 if ( defined $parm and $parm ne $self -> {'filename'} ) {
732 $self -> {'filename'} = $parm;
733 $self -> {'data_id'} = undef;
734 # $self -> _write;
737 end filename
738 # }}} filename
740 # {{{ fractions
742 start fractions
744 my %factors = $self -> factors( 'return_occurences' => 1,
745 'unique_in_individual' => $unique_in_individual,
746 'column_head' => $column_head,
747 'column' => $column);
749 my $sum = 0;
750 while (my ($factor, $amount) = each %factors) {
751 if ( $factor == $self -> {'missing_data'} && $ignore_missing ) {
752 next;
753 } else {
754 $sum += $amount;
757 while (my ($factor, $amount) = each %factors) {
758 if ( $factor == $self -> {'missing_data'} && $ignore_missing ) {
759 next;
760 } else {
761 $fractions{$factor} = $amount/$sum;
765 end fractions
767 # }}} fractions
769 # {{{ factors
771 start factors
773 # Either column (number, starting at 1) or column_head must be specified.
775 # The default behaviour is to return a hash with the factors as keys
776 # and as values references to arrays with the order numbers (not the ID numbers)
777 # of the individuals that contain this factor
779 # If unique_in_individual is true (1), the returned hash will contain
780 # an element with key 'Non-unique values found' and value 1 if any
781 # individual contain more than one value in the specified column.
783 # Return occurences will calculate the occurence of each
784 # factor value. Several occurences in one individual counts as
785 # one occurence. The elements of the returned hash will have the factors
786 # as keys and the number of occurences as values.
789 $self -> synchronize;
791 # Check if $column(-index) is defined and valid, else try to find index
792 # using column_head
793 my $first_id = $self -> {'individuals'}[0];
795 debug -> die( message => "No individuals defined in data object based on ".
796 $self -> full_name ) unless ( defined $first_id );
798 my @data_row = split( /,/, $first_id -> subject_data -> [0] );
799 unless ( defined $column && defined( $data_row[$column-1] ) ) {
800 unless (defined($column_head) && defined($self -> {'column_head_indices'}{$column_head})) {
801 debug -> die( message => "Error in data -> factors: unknown column: \"$column_head\" ".
802 "or invalid column number: \"$column\".\n".
803 "Valid column numbers are 1 to ".scalar @data_row ."\n".
804 "Valid column headers are (in no particular order):\n".
805 join(', ',keys(%{$self -> {'column_head_indices'}})) );
806 } else {
807 $column = $self -> {'column_head_indices'}{$column_head};
808 debug -> warn( level => 2,
809 message => "$column_head is in column number $column" );
813 my $key = 0;
814 foreach my $individual ( @{$self -> {'individuals'}} ) {
815 my @ifactors = keys %{$individual -> factors( column => $column )};
816 if ( scalar @ifactors > 1 and $unique_in_individual ) {
817 %factors = ( 'Non-unique values found' => 1 );
818 last;
820 debug -> die( message => "No value found in column $column in individual ".
821 $individual -> idnumber ) if ( scalar @ifactors == 0 );
823 # Return occurences will calculate the occurence of each
824 # factor value. Several occurences in one individual counts as
825 # one occurence.
827 if ( $return_occurences ) {
828 foreach my $ifactor ( @ifactors ) {
829 $factors{$ifactor}++;
831 } else {
832 foreach my $ifactor ( @ifactors ) {
833 push( @{$factors{$ifactor}}, $key );
836 $key++;
839 end factors
841 # }}} factors
843 # {{{ find_individual
845 # start find_individual
846 # foreach my $tmp_ind ( @{$self -> individuals} ) {
847 # if ( $tmp_ind -> key == $key ) {
848 # $individual = $tmp_ind;
849 # last;
852 # if ( defined $individual ) {
853 # if ( $copy ) {
854 # $individual = $individual -> copy;
856 # } else {
857 # print "No individual with key $key found in call to ".
858 # "data -> find_individual\n" if ( $self -> debug );
860 # end find_individual
862 # }}}
864 # {{{ format_data
866 start format_data
868 my $header = $self -> {'header'};
870 # format the data for NONMEM (simple comma-separated layout)
871 if ( defined $self -> {'comment'} ) {
872 my @comment = @{$self -> {'comment'}};
873 for ( @comment ) {
874 push( @form_data );
878 my $wrap = ( defined $self -> {'wrap_column'} and
879 defined $self -> {'cont_column'} );
881 my @primary_columns = defined $self -> {'primary_columns'} ?
882 @{$self -> {'primary_columns'}} : ();
883 my @secondary_columns = defined $self -> {'secondary_columns'} ?
884 @{$self -> {'secondary_columns'}} : ();
885 if ( defined $header and defined $self -> {'ignoresign'} ) {
886 my $istr;
887 if ( $self -> {'ignoresign'} ne '@' ) {
888 $istr = $self -> {'ignoresign'};
890 if ( $wrap ) {
891 my @h_data;
892 for ( my $i = 0; $i <= $#secondary_columns ; $i++ ) {
893 my $sstr = $istr;
894 for ( my $j = 0; $j < scalar @{$secondary_columns[$i]} ; $j++ ) {
895 my $jstr = $j == 0 ? '' : ',';
896 $sstr = $sstr.$jstr.$secondary_columns[$i][$j][0];
898 push( @h_data, $sstr."\n" );
900 push( @form_data, @h_data );
901 my $pstr = $istr;
902 for ( my $i = 0; $i <= $#primary_columns ; $i++ ) {
903 my $jstr = $i == 0 ? '' : ',';
904 $pstr = $pstr.$jstr.$primary_columns[$i][0];
906 push( @form_data, $pstr."\n" );
907 } else {
908 push( @form_data, $istr.join(',',@{$self -> {'header'}})."\n" );
911 if ( $wrap ) {
912 foreach my $individual ( @{$self -> {'individuals'}} ) {
913 foreach my $row ( @{$individual -> subject_data} ) {
914 my @r_data;
915 for ( my $i = 0; $i <= $#secondary_columns ; $i++ ) {
916 my $sstr = '';
917 for ( my $j = 0; $j < scalar @{$secondary_columns[$i]} ; $j++ ) {
918 my $jstr = $j == 0 ? '' : ',';
919 if ( $secondary_columns[$i][$j][0] eq 'CONT' ) {
920 $sstr = $sstr.$jstr.'1';
921 } else {
922 my @data_row = split( /,/, $row );
923 $sstr = $sstr.$jstr.$data_row[$secondary_columns[$i][$j][1]];
926 push( @r_data, $sstr."\n" );
928 push( @form_data, @r_data );
929 my $pstr = '';
930 for ( my $i = 0; $i <= $#primary_columns ; $i++ ) {
931 my $jstr = $i == 0 ? '' : ',';
932 if ( $primary_columns[$i][0] eq 'CONT' ) {
933 $pstr = $pstr.$jstr.'0';
934 } else {
935 my @data_row = split( /,/, $row );
936 $pstr = $pstr.$jstr.$data_row[$primary_columns[$i][1]];
939 push( @form_data, $pstr."\n" );
942 } else {
943 foreach my $individual ( @{$self -> {'individuals'}} ) {
944 foreach my $row ( @{$individual -> subject_data} ) {
945 push( @form_data, $row ."\n" );
950 end format_data
952 # }}} format_data
954 # {{{ drop_dropped
956 start drop_dropped
958 # This method removes columns that has '=DROP' value in the
959 # model header as given by $INPUT. The model header must be
960 # transfered to this method through the model_header
961 # argument. The model_header argument should be a
962 # two-dimensional array where each position in the first
963 # dimension should be a reference to a 1*2 array holding the
964 # column name and value. Any ignore-sign must be removed.
966 debug -> die( message => 'model header must be defined' )
967 if ( $#model_header < 0 );
968 # Important that the drop_dropped method of the model::problem
969 # class is in sync with this method.
970 $self -> synchronize;
972 $self -> {'header'} = [];
973 my @drop;
974 my $counter = 1;
975 for( my $i = 0; $i <= $#model_header; $i++ ) {
976 $self -> {'idcolumn'} = $counter if ( $model_header[$i][0] eq 'ID' );
977 if( $model_header[$i][1] eq 'DROP' and
978 not $model_header[$i][0] =~ /DAT(E|1|2|3)/ ) {
979 push( @drop, 1 );
980 } else {
981 $counter++;
982 push( @drop, 0 );
983 push( @{$self -> {'header'}}, $model_header[$i][0] );
987 foreach my $individual ( @{$self -> {'individuals'}} ) {
988 $individual -> drop_columns( drop => \@drop );
991 $self -> {'synced'} = 0;
992 # $Data::Dumper::Maxdepth = 2;
993 # die Dumper $self;
994 # die Dumper $self -> {'individuals'};
996 end drop_dropped
998 # }}} drop_dropped
1000 # {{{ wrap
1001 start wrap
1003 $self -> synchronize;
1004 $self -> cont_column( $cont_column ) if ( defined $cont_column );
1005 $self -> wrap_column( $wrap_column ) if ( defined $wrap_column );
1006 $self -> prepare_wrap( model_header => \@model_header );
1007 @secondary_columns = @{$self -> {'secondary_columns'}}
1008 if ( defined $self -> {'secondary_columns'} );
1009 @primary_columns = @{$self -> {'primary_columns'}}
1010 if ( defined $self -> {'primary_columns'} );
1012 end wrap
1013 # }}} wrap
1015 # {{{ unwrap
1016 start unwrap
1018 $self -> {'cont_column'} = undef;
1019 $self -> {'wrap_column'} = undef;
1020 $self -> {'secondary_columns'} = undef;
1021 $self -> {'primary_columns'} = undef;
1023 end unwrap
1024 # }}} unwrap
1026 # {{{ prepare_wrap
1028 start prepare_wrap
1030 my $cont_column = $self -> {'cont_column'};
1031 my $wrap_column = $self -> {'wrap_column'};
1032 debug -> die( message => 'cont_column ('.$cont_column.') must be less or equal '.
1033 'to the requested number of columns in each row ('.
1034 ($wrap_column).')' )
1035 if ( $cont_column > $wrap_column );
1036 my @header;
1037 if ( scalar @model_header > 0 ) {
1038 @header = @model_header;
1039 } else {
1040 @header = @{$self -> {'header'}};
1043 my ( @primary, @secondary, @date_columns );
1045 for ( my $i = 0; $i <= $#header; $i++ ) {
1046 my $name = ref( $header[$i] ) eq 'ARRAY' ? $header[$i][0] : $header[$i];
1047 my $value = ref( $header[$i] ) eq 'ARRAY' ? $header[$i][1] : undef;
1048 next if ( $name eq 'ID' );
1049 my $found = 0;
1050 foreach my $prim ( @primary_column_names ) {
1051 if ( not $found and
1052 ( $name eq $prim or $value eq $prim ) ) {
1053 push( @primary, [$name, $i, $value] );
1054 $found = 1;
1055 my $col = ($#primary+2)>= $cont_column ? ($#primary+3) : ($#primary+2);
1056 push( @date_columns, $col ) if ( $name =~ /DAT(E|1|2|3)/ );
1059 push( @secondary, [$name, $i, $value] ) if ( not $found );
1062 my $prim_num = scalar @primary;
1063 debug -> die( message => 'The number of primary columns (that need to '.
1064 'be part of the row with CONT=0) ('.($prim_num+1).
1065 ') is larger than the required number of columns (wrap_column='.
1066 $wrap_column.') - 1' )
1067 if ( scalar $prim_num > ($wrap_column-2) );
1069 my ( $i, $dum ) = ( 0, 1 );
1070 my @tmp;
1071 for ( my $j = 1; $j <= $wrap_column; $j++ ) {
1072 if( $j == 1 ) {
1073 push( @tmp, ['ID', $self -> {'idcolumn'}-1] );
1074 } elsif ( $j == $wrap_column ) {
1075 if ( $j == $cont_column ) {
1076 push( @tmp, ['CONT', undef] );
1077 } else {
1078 my $val;
1079 if ( defined $primary[$i] ) {
1080 $val = $primary[$i];
1081 } elsif ( defined $secondary[0] ) {
1082 $val = shift(@secondary);
1083 } else {
1084 $val = ['XX'.$dum++,$self -> {'idcolumn'}-1];
1086 push( @tmp, $val );
1087 $i++;
1089 push( @{$self -> {'primary_columns'}}, @tmp );
1090 } else {
1091 if ( $j == $cont_column ) {
1092 push( @tmp, ['CONT', undef] );
1093 } else {
1094 if ( $i <= $#primary ) {
1095 push( @tmp, $primary[$i] );
1096 $i++;
1097 } else {
1098 my $val = defined $secondary[0] ? shift(@secondary) :
1099 ['XX'.$dum++,$self -> {'idcolumn'}-1];
1100 push( @tmp, $val );
1106 my $i = 0;
1107 while ( $i <= $#secondary ) {
1108 my @tmp;
1109 for ( my $j = 1; $j <= $wrap_column; $j++ ) {
1110 if( $j == 1 ) {
1111 push( @tmp, ['ID', $self -> {'idcolumn'}-1] );
1112 } elsif ( $j == $wrap_column ) {
1113 if ( $j == $cont_column ) {
1114 push( @tmp, ['CONT', undef] );
1115 } else {
1116 my $val = defined $secondary[$i] ? $secondary[$i] :
1117 ['XX'.$dum++,$self -> {'idcolumn'}-1];
1118 push( @tmp, $val );
1119 $i++;
1121 unshift( @{$self -> {'secondary_columns'}}, \@tmp );
1122 } else {
1123 if ( $j == $cont_column ) {
1124 push( @tmp, ['CONT', undef] );
1125 } else {
1126 my $isdate = 0;
1127 if ( $#date_columns >= 0 ) {
1128 foreach my $col ( @date_columns ) {
1129 # This is a date column which may have to be dropped
1130 # and thus will not appear as a secondary
1131 # column. Nothing should be pushed. The indexes in
1132 # model::problem::pk::_format_record will be ok.
1133 $isdate = 1 if ( $col == $j ) ;
1136 if ( $isdate ) {
1137 push( @tmp, ['XX'.$dum++,$self -> {'idcolumn'}-1] );
1138 } else {
1139 if ( $i <= $#secondary ) {
1140 push( @tmp, $secondary[$i] );
1141 $i++;
1142 } else {
1143 push( @tmp, ['XX'.$dum++,$self -> {'idcolumn'}-1] );
1151 end prepare_wrap
1153 # }}} prepare_wrap
1155 # {{{ have_missing_data
1156 start have_missing_data
1158 # Either I<column> or I<column_head> must be specified.
1160 # This method looks through the data column with index I<column> or
1161 # (optional) header name I<column_head> and returns O if no missing
1162 # data indicator was found or 1 otherwise.
1164 $self -> synchronize;
1165 my $first_id = $self -> {'individuals'}[0];
1166 debug -> die( message => "No individuals defined in data object based on ".
1167 $self -> full_name ) unless ( defined $first_id );
1168 my @data_row = split( /,/ , $first_id -> subject_data -> [0] );
1169 unless ( defined $column && defined( $data_row[$column-1] ) ) {
1170 unless(defined($column_head) && defined($self -> {'column_head_indices'}{$column_head})){
1171 die "Error in data -> have_missing_data: unknown column: \"$column_head\" or invalid column number: \"$column\"\n";
1172 } else {
1173 $column = $self -> {'column_head_indices'}{$column_head};
1176 $self -> flush if ( $self -> {'target'} eq 'disk' );
1178 # In case anyone wonders, the ternary statment ( bool ? true :
1179 # false ) below will possibly make a minuscle memory
1180 # optimization. But hey, why not :)
1182 $return_value = defined $self -> {'have_missing_data'} ? $self -> {'have_missing_data'} -> {$column} : 0;
1184 end have_missing_data
1185 # }}} have_missing_data
1187 # {{{ merge
1188 start merge
1190 #$self -> synchronize;
1191 push( @{$self -> {'individuals'}}, @{$mergeobj -> individuals} );
1193 end merge
1194 # }}} merge
1196 # {{{ max
1198 start max
1200 # Either column or column_head must be specified. Column_head must be a string that
1201 # identifies a column in the (optional ) data file header.
1203 # The if-statement below used to be a cache of allready calculated
1204 # means. But since individuals can be accessed in so many ways, we
1205 # don't know when this cache should be updated. Its easier to
1206 # recalculate the max. Maybe we can include this optimization in the
1207 # future, if it turns out to be a bottleneck
1208 # my $tmp_column = $self -> {'column_head_indices'}{$column_head};
1209 # if ( defined $self -> {'max'}[$tmp_column] ) {
1210 # $return_value = $self -> {'max'}[$tmp_column] ;
1211 # } else {
1212 $self -> synchronize;
1213 my $first_id = $self -> {'individuals'}[0];
1214 debug -> die( message => "data -> max: No individuals defined in data object based on " .
1215 $self -> full_name ) unless defined $first_id;
1217 my @data_row = split( /,/ , $first_id -> subject_data ->[0] );
1219 unless ( defined $column && defined( $data_row[$column-1] ) ) {
1220 unless (defined($column_head) && defined($self -> {'column_head_indices'}{$column_head})) {
1221 die "Error in data -> max: unknown column: \"$column_head\" or invalid column number: \"$column\"\n";
1222 } else {
1223 $column = $self -> {'column_head_indices'}{$column_head};
1226 foreach my $individual ( @{$self -> {'individuals'}} ) {
1227 my $ifactors = $individual -> factors( 'column' => $column );
1228 foreach ( keys %{$ifactors} ) {
1229 next if ( $_ == $self -> {'missing_data_token'} );
1230 if ( defined ($return_value) ) {
1231 $return_value = $_ > $return_value ? $_ : $return_value;
1232 } else {
1233 $return_value = $_;
1238 # $self -> {'max'}[$column] = $return_value;
1239 $self -> flush if ( $self -> {'target'} eq 'disk' );
1242 end max
1244 # }}} max
1246 # {{{ min
1248 start min
1250 # See L</max>.
1251 my $tmp_column = $self -> {'column_head_indices'}{$column_head};
1253 # The if-statement below used to be a cache of allready calculated
1254 # means. But since individuals can be accessed in so many ways, we
1255 # don't know when this cache should be updated. Its easier to
1256 # recalculate the min. Maybe we can include this optimization in the
1257 # future, if it turns out to be a bottleneck
1258 # if ( defined $self -> {'min'}[$tmp_column] ) {
1259 # $return_value = $self -> {'min'}[$tmp_column] ;
1260 # } else {
1261 $self -> synchronize;
1262 my $first_id = $self -> {'individuals'}[0];
1263 die "data -> min: No individuals defined in data object based on ",
1264 $self -> full_name,"\n" unless defined $first_id;
1266 my @data_row = split( /,/ , $first_id -> subject_data ->[0] );
1268 unless ( defined $column && defined( $data_row[$column-1] ) ) {
1269 unless (defined($column_head) && defined($self -> {'column_head_indices'}{$column_head})) {
1270 die "Error in data -> min: unknown column: \"$column_head\" or invalid column number: \"$column\"\n";
1271 } else {
1272 $column = $self -> {'column_head_indices'}{$column_head};
1275 foreach my $individual ( @{$self -> {'individuals'}} ) {
1276 my $ifactors = $individual -> factors( 'column' => $column );
1277 foreach ( keys %{$ifactors} ) {
1278 next if ( $_ == $self -> {'missing_data_token'} );
1279 if ( defined ($return_value) ) {
1280 $return_value = $_ < $return_value ? $_ : $return_value;
1281 } else {
1282 $return_value = $_;
1286 # $self -> {'min'}[$column] = $return_value;
1287 $self -> flush if ( $self -> {'target'} eq 'disk' );
1290 end min
1292 # }}} min
1294 # {{{ median
1296 start median
1298 # See L</max>.
1299 $self -> synchronize;
1300 my $first_id = $self -> {'individuals'}[0];
1301 die "data -> median: No individuals defined in data object based on ",
1302 $self -> full_name,"\n" unless defined $first_id;
1304 my @data_row = split( /,/ , $first_id -> subject_data ->[0] );
1306 unless ( defined $column && defined( $data_row[$column-1] ) ) {
1307 unless(defined($column_head) && defined($self -> {'column_head_indices'}{$column_head})){
1308 die "Error in data -> median: unknown column: \"$column_head\" or invalid column number: \"$column\"\n";
1309 } else {
1310 $column = $self -> {'column_head_indices'}{$column_head};
1314 if( defined $self -> {'median'}[$column] ){
1315 return $self -> {'median'}[$column];
1318 my @median_array;
1320 foreach my $individual ( @{$self -> {'individuals'}} ) {
1321 if( $unique_in_individual ){
1322 my $ifactors = $individual -> factors( 'column' => $column );
1324 foreach ( keys %{$ifactors} ) {
1325 next if ( $_ == $self -> {'missing_data_token'} );
1326 push( @median_array, $_ );
1328 } else {
1329 my $ifactors = $individual -> subject_data;
1331 for(my $i=0; $i<=$#{$ifactors}; $i++ ) {
1332 my @data_row = split( /,/ , $ifactors -> [$i] );
1333 next if ( $data_row[$column-1] == $self -> {'missing_data_token'} );
1334 push(@median_array, $data_row[$column-1]);
1338 @median_array = sort {$a <=> $b} @median_array ;
1340 if( @median_array % 2 ){
1341 $return_value = $median_array[$#median_array / 2];
1342 } else {
1343 $return_value = ( $median_array[@median_array / 2] +
1344 $median_array[(@median_array - 2) / 2] ) / 2;
1347 $self -> {'median'}[$column] = $return_value;
1349 end median
1351 # }}} median
1353 # {{{ mean
1355 start mean
1357 # Returns mean value of a column
1358 # If a individual contains more then 1 value (i.e. if an
1359 # individual has different values in different samples a mean
1360 # value of all individuals if calculate first, then the mean
1361 # value of the column If hi_cutoff is defined the mean function
1362 # will cut all value below the cutoff, and set their value to
1363 # 0. It's used to calculate the HI-mean/LOW-mean of a column for
1364 # e.g. Hockey-stick covariates If both hi_cutoff and low_cutoff
1365 # are defined only the hi_cutoff will be used. See L</max>.
1366 my $tmp_column = $self -> {'column_head_indices'}{$column_head};
1367 $self -> synchronize;
1368 my $first_id = $self -> {'individuals'}[0];
1369 die "data -> mean: No individuals defined in data object based on ",
1370 $self -> full_name,"\n" unless defined $first_id;
1372 my @data_row = split( /,/, $first_id -> subject_data ->[0] );
1374 unless ( defined $column && defined( $data_row[$column-1] ) ) {
1375 unless (defined($column_head) && defined($self -> {'column_head_indices'}{$column_head})) {
1376 die "Error in data -> mean: unknown column: \"$column_head\" or invalid column number: \"$column\"\n";
1377 } else {
1378 $column = $self -> {'column_head_indices'}{$column_head};
1382 ## Here the calculation starts
1383 my $num_individuals = 0;
1384 my $sum = 0;
1386 my $all_data_rows=0;
1387 foreach my $individual ( @{$self ->{'individuals'}} ) {
1389 my $ifactors = $individual -> subject_data;
1390 my $individual_sum = 0;
1391 my $data_rows = 0;
1392 for(my $i=0; $i<=$#{$ifactors}; $i++ ) {
1394 # data is stored in strings. We need to split them into an
1395 # array.
1397 my @data_row = split( /,/, $ifactors -> [$i] );
1398 if ( $data_row[$column-1] == $self -> {'missing_data_token'} ) {
1399 # print "Skipping row with missing data\n";
1400 next;
1403 if( defined $subset_column and not eval ( $data_row[$subset_column-1].$subset_syntax ) ) {
1404 # print "Skipping row outside subset: syntax: ".($subset_column-1)." $subset_syntax\n";
1405 next;
1408 if (defined $hi_cutoff) {
1409 if ($data_row[$column-1]>$hi_cutoff) {
1410 $individual_sum += $data_row[$column-1]-$hi_cutoff;
1413 else {
1414 if (defined $low_cutoff) {
1415 if ($data_row[$column-1]<$low_cutoff) {
1416 $individual_sum += $low_cutoff - $data_row[$column-1];
1419 else {
1420 $individual_sum += $data_row[$column-1];
1423 $data_rows++;
1425 if( $global_mean ) {
1426 $sum += $individual_sum;
1427 $num_individuals += $data_rows;
1428 } else {
1429 if( $data_rows != 0 ) {
1430 $sum += $individual_sum/$data_rows;
1432 $num_individuals ++;
1434 $all_data_rows += $data_rows;
1436 if( $num_individuals != 0 ) {
1437 $return_value = $sum / $num_individuals;
1439 # print "DR: $all_data_rows\n";
1440 # print "\nNIM: $num_individuals $return_value\n";
1443 end mean
1445 # }}} mean
1447 # {{{ sd
1449 start sd
1451 # This sub returns standard deviation for a specific column
1452 # If there are more than one sample/individual the value used for that specific
1453 # individual is the mean value of its samples.
1454 # The cut-offs are for hockey stick variables. I.e. If one individual value is
1455 # lower than the hi-cutoff the individual value will be zero.
1456 # HI_cutoff is used to calculate the HI-mean of a column.
1457 # If cut_off is undef it won't be used
1458 # See L</max>.
1459 my $tmp_column = $self -> {'column_head_indices'}{$column_head};
1460 $self -> synchronize;
1461 my $first_id = $self -> {'individuals'}[0];
1462 debug -> die( message => "No individuals defined in data object based on ".
1463 $self -> full_name ) unless defined $first_id;
1465 my @data_row = split( /,/ , $first_id -> subject_data ->[0] );
1467 unless ( defined $column && defined( $data_row[$column-1] ) ) {
1468 unless (defined($column_head) && defined($self -> {'column_head_indices'}{$column_head})) {
1469 debug -> die( message => "Unknown column: \"$column_head\" or "
1470 ."invalid column number: \"$column\"" );
1471 } else {
1472 $column = $self -> {'column_head_indices'}{$column_head};
1476 ## Here the calculation starts
1477 my $num_individuals = 0;
1478 my $sum = 0;
1479 my $mean;
1480 if (defined $hi_cutoff) {
1481 $mean = $self->mean(column => $column,
1482 hi_cutoff => $hi_cutoff,
1483 global_mean => $global_sd );
1484 } elsif (defined $low_cutoff) {
1485 $mean = $self->mean(column => $column,
1486 low_cutoff => $low_cutoff,
1487 global_mean => $global_sd );
1488 } else {
1489 $mean = $self->mean( column => $column,
1490 subset_column => $subset_column,
1491 subset_syntax => $subset_syntax,
1492 global_mean => $global_sd );
1495 foreach my $individual ( @{$self -> {'individuals'}} ) {
1496 my $ifactors = $individual -> subject_data;
1497 my $individual_sum = 0;
1498 my $data_rows = 0;
1499 for(my $i=0; $i<=$#{$ifactors}; $i++ ) {
1501 # data is stored in strings. We need to split them into an
1502 # array.
1504 my @data_row = split( /,/, $ifactors -> [$i] );
1506 if ( $data_row[$column-1] == $self -> {'missing_data_token'} ) {
1507 # print "Skipping row with missing data\n";
1508 next;
1511 if( defined $subset_column and not eval ( $data_row[$subset_column-1].$subset_syntax ) ) {
1512 # print "Skipping row outside subset: syntax: ".($subset_column-1)." $subset_syntax\n";
1513 next;
1516 if (defined $hi_cutoff) {
1517 if ($ifactors->[$i]->[$column-1]>$hi_cutoff) {
1518 if( $global_sd ) {
1519 $individual_sum += ($data_row[$column-1] - $hi_cutoff - $mean) ** 2;
1520 } else {
1521 $individual_sum += $data_row[$column-1]-$hi_cutoff;
1524 } else {
1525 if (defined $low_cutoff) {
1526 if ($ifactors->[$i]->[$column-1]<$low_cutoff) {
1527 if( $global_sd ) {
1528 $individual_sum += ($low_cutoff - $data_row[$column-1] - $mean) ** 2;
1529 } else {
1530 $individual_sum += $low_cutoff - $data_row[$column-1];
1533 } else {
1534 if( $global_sd ) {
1535 $individual_sum += ($data_row[$column-1] - $mean) ** 2;
1536 } else {
1537 $individual_sum += $data_row[$column-1];
1541 $data_rows++;
1543 if( $global_sd ) {
1544 $sum += $individual_sum;
1545 $num_individuals += $data_rows;
1546 } else {
1547 if( $data_rows != 0 ) {
1548 $sum += ($individual_sum/$data_rows - $mean) ** 2;
1550 $num_individuals++;
1553 if( $num_individuals < 2 ) {
1554 $return_value = 0;
1555 } else {
1556 if( $num_individuals != 0 ) {
1557 $return_value = (1/($num_individuals-1)*$sum) ** 0.5;
1562 end sd
1564 # }}} sd
1566 # {{{ range
1567 start range
1569 # See L</max>.
1570 my $tmp_column = $self -> {'column_head_indices'}{$column_head};
1571 if ( defined $self -> {'range'}[$tmp_column] ) {
1572 $return_value = $self -> {'range'}[$tmp_column];
1573 } else {
1574 my $old_target = $self -> {'target'};
1575 $self -> {'target'} = 'mem';
1576 $self -> synchronize;
1577 $return_value = $self -> max( column => $column,
1578 column_head => $column_head ) -
1579 $self -> min( column => $column,
1580 column_head => $column_head );
1581 $self -> {'range'}[$column] = $return_value;
1582 if ( $old_target eq 'disk' ) {
1583 $self -> flush if ( $self -> {'target'} eq 'disk' );
1584 $self -> {'target'} = 'disk';
1588 end range
1589 # }}} range
1591 # {{{ recalc_column
1592 start recalc_column
1594 # Recalculates a column based on expression. Also, see L</max>.
1595 $self -> synchronize;
1597 # Check if $column(-index) is defined and valid, else try to find index using column_head
1598 my $first_id = $self -> {'individuals'}[0];
1599 die "data -> recalc_column: No individuals defined in data object based on ",
1600 $self -> full_name,"\n" unless defined $first_id;
1602 my @data_row = split( /,/ , $first_id -> subject_data ->[0] );
1604 unless ( defined $column && defined( $data_row[$column-1] ) ) {
1605 if(defined($column_head) && defined($self -> {'column_head_indices'}{$column_head})){
1606 die "Error in data -> recalc_column: unknown column: \"$column_head\" or column number: \"$column\"\n";
1607 } else {
1608 $column = $self -> {'column_head_indices'}{$column_head};
1612 for my $individual ( @{$self -> {'individuals'}} ) {
1613 $individual -> recalc_column( column => $column,
1614 expression => $expression );
1617 end recalc_column
1618 # }}} recalc_column
1620 # {{{ renumber_ascending
1622 start renumber_ascending
1624 # Renumbers the individuals (changes the subject identifiers) so that
1625 # all have unique integer numbers starting with start_at and
1626 # ascending. The primary use of this
1627 # method is not to order the individuals after their identifiers but to
1628 # ensure that all individuals have unique identifiers.
1630 $self -> synchronize;
1631 foreach my $individual ( @{$self -> {'individuals'}} ) {
1632 $individual -> idnumber ( $start_at++ );
1634 $self -> {'synced'} = 0;
1636 end renumber_ascending
1638 # }}} renumber_ascending
1640 # {{{ renumber_descending
1642 start renumber_descending
1644 # See L</renumber_ascending>.
1645 $self -> synchronize;
1646 foreach my $individual ( @{$self -> {'individuals'}} ) {
1647 $individual -> idnumber ( $start_at-- );
1649 $self -> {'synced'} = 0;
1651 end renumber_descending
1653 # }}} renumber_descending
1655 # {{{ single_valued_data
1657 start single_valued_data
1659 # Usage:
1661 # ($single_value_data_set, $remainder, $column_indexes) =
1662 # $data_object -> single_valued_data( subset_name => 'subset.dta',
1663 # remainder_name => 'remainder.dta',
1664 # target => 'disk',
1665 # do_not_test_columns => [1..18,24,26];
1667 # my $single_value_column_indexes = $column_indexes -> [0];
1668 # my $all_other_column_indexes = $column_indexes -> [1];
1670 # Analyses the content of each column, based on the
1671 # ID column, and returns two new data objects: One
1672 # that contains all columns that is has only one value per
1673 # individual and one that contains the
1674 # remainding data. This is useful for creating compact 'extra'
1675 # data sets that can be read in via user-defined sub-routines
1676 # when the number of columns needed exceeds the maximum that
1677 # NONMEM allows (e.g. 20 in NONMEM version V).
1679 # The I<do_not_test_columns> argument specifies on which columns
1680 # to skip the single value test
1682 my @multi_value_flags;
1683 my @individuals = @{$self -> {'individuals'}};
1684 # Initiate the flags:
1685 if ( defined $individuals[0] ) {
1686 my @data = @{$individuals[0] -> {'subject_data'}};
1687 my @data_row = split( /,/ , $data[0] );
1688 for ( my $i = 0; $i < scalar @data_row; $i++ ) {
1689 my $dnt_flag = 0;
1690 foreach my $dntc ( @do_not_test_columns ) {
1691 $dnt_flag = 1 if ( $i == $dntc - 1 );
1693 $multi_value_flags[$i] = $dnt_flag;
1695 } else {
1696 die "data -> single_valued_data: No data in ID number 1\n";
1698 # Collect the stats
1699 for ( my $id = 0; $id <= $#individuals; $id++ ) {
1700 my @data = @{$individuals[$id] -> {'subject_data'}};
1701 my @data_row = split( /,/, $data[0] );
1702 for ( my $j = 0; $j < scalar @data_row; $j++ ) {
1703 my %col_unique;
1704 for ( my $i = 0; $i <= $#data; $i++ ) {
1705 my @data_row = split( /,/ , $data[$i] );
1706 $col_unique{$data_row[$j]}++;
1708 my $factors = scalar keys %col_unique;
1709 $multi_value_flags[$j]++ if ( $factors > 1 );
1712 for ( my $i = 0; $i <= $#multi_value_flags; $i++ ) {
1713 if ( $multi_value_flags[$i] ) {
1714 push ( @{$column_indexes[1]}, $i + 1);
1715 } else {
1716 push ( @{$column_indexes[0]}, $i + 1);
1719 ( $single_value_data_set, $remainder ) =
1720 $self -> subset_vertically( column_indexes => $column_indexes[0],
1721 subset_name => $subset_name,
1722 return_remainder => 1,
1723 remainder_name => $remainder_name,
1724 target => $target,
1725 keep_first_row_only => 1);
1727 end single_valued_data
1729 # }}}
1731 # {{{ subset_vertically
1733 start subset_vertically
1735 # Usage:
1737 # $subset = $data_object -> subset_vertically ( column_indexes => [1,2,6],
1738 # subset_name => 'subset.dta' );
1740 # This basic usage returns a new data object containing
1741 # columns 1,2 and 6 from the original data plus the
1742 # idcolumn. The new data object will be associated with the
1743 # file 'subset.dta'.
1745 # You get the remaining data, i.e. the original data minus
1746 # the created subset by specifying
1748 # ( $subset, $remainder ) =
1749 # $data_object -> subset_vertically ( column_indexes => [1,2,6],
1750 # subset_name => 'subset.dta',
1751 # return_remainder => 1,
1752 # remainder_name => 'remainder.dta' );
1754 # If you would like to flush the created data sets to disk and
1755 # save memory, set the I<target> argument to 'disk'. The
1756 # default value 'mem' will keep the whole data object in RAM.
1758 # The I<keep_first_row_only> argument can be used to reduce
1759 # the size of the subset data obejct by excluding all but the
1760 # first row of data from each individual.
1762 my @individuals = @{$self -> {'individuals'}};
1763 # Create remainder index array if necessary
1764 my @remainder_indexes;
1765 if ( defined $individuals[0] ) {
1766 my @data = @{$individuals[0] -> {'subject_data'}};
1767 my $idcolumn = $individuals[0] -> {'idcolumn'};
1768 # print "IC: $idcolumn\n";
1769 my $id_flag = 0;
1770 foreach my $use_index ( @column_indexes ) {
1771 $id_flag = 1 if ( $use_index == $idcolumn );
1773 if ( $return_remainder ) {
1774 # @remainder_indexes = ( $idcolumn );
1775 for ( my $i = 0; $i < scalar split(/,/,$data[0]); $i++ ) {
1776 my $rem_flag = 1;
1777 foreach my $use_index ( @column_indexes ) {
1778 $rem_flag = 0 if ( $i == $use_index -1 );
1779 # or
1780 # $i == $idcolumn -1 );
1782 push( @remainder_indexes, $i + 1 ) if ( $rem_flag );
1784 unshift( @remainder_indexes, $idcolumn ) if ( $id_flag );
1786 unshift( @column_indexes, $idcolumn ) unless ( $id_flag );
1787 } else {
1788 die "data -> single_valued_data: No data in ID number 1\n";
1791 # print "SS: @column_indexes\n";
1792 # print "R : @remainder_indexes\n";
1794 my @new_ids;
1795 my @new_ids_2;
1796 for ( my $id = 0; $id <= $#individuals; $id++ ) {
1797 my $idnumber = $individuals[$id] -> idnumber;
1798 my $idcolumn = $individuals[$id] -> idcolumn;
1799 my @data = @{$individuals[$id] -> {'subject_data'}};
1800 my @new_data;
1801 my @new_data_2;
1802 my $use_rows = $keep_first_row_only ? 0 : $#data;
1803 for ( my $i = 0; $i <= $use_rows; $i++ ) {
1804 my @new_row;
1805 my @data_row = split( /,/, $data[$i] );
1806 foreach my $use_index ( @column_indexes ) {
1807 push( @new_row, $data_row[$use_index-1] );
1809 # print "@new_row $#new_row\n";
1810 push( @new_data, join( ',', @new_row ) );
1812 for ( my $i = 0; $i <= $#data; $i++ ) {
1813 if ( $return_remainder ) {
1814 my @new_row_2;
1815 my @data_row = split( /,/, $data[$i] );
1816 foreach my $use_index ( @remainder_indexes ) {
1817 push( @new_row_2, $data_row[$use_index-1] );
1819 # print "@new_row_2 $#new_row_2\n";
1820 push( @new_data_2, join( ',' , @new_row_2 ) );
1823 my $new_id = data::individual -> new( idnumber => $idnumber,
1824 idcolumn => $idcolumn,
1825 subject_data => \@new_data );
1826 push( @new_ids, $new_id );
1827 if ( $return_remainder ) {
1828 my $new_id_2;
1829 $new_id_2 = data::individual -> new( idnumber => $idnumber,
1830 idcolumn => $idcolumn,
1831 subject_data => \@new_data_2 );
1832 push( @new_ids_2, $new_id_2 );
1835 my @header = @{$self -> {'header'}};
1836 my @new_header;
1837 foreach my $use_index ( @column_indexes ) {
1838 push( @new_header, @header[$use_index-1] );
1840 my $comment;
1841 if( defined $self -> {'comment'} ){
1842 my @comment = @{$self -> {'comment'}};
1843 $comment = \@comment;
1845 $subset = data -> new ( filename => $subset_name,
1846 directory => $self -> {'directory'},
1847 ignoresign => $self -> {'ignoresign'},
1848 header => \@new_header,
1849 comment => $comment,
1850 individuals => \@new_ids,
1851 target => $target,
1852 ignore_missing_files => 1 );
1853 if ( $return_remainder ) {
1854 my @new_header_2;
1855 foreach my $use_index ( @remainder_indexes ) {
1856 push( @new_header_2, @header[$use_index-1] );
1858 $remainder = data -> new ( filename => $remainder_name,
1859 directory => $self -> {'directory'},
1860 ignoresign => $self -> {'ignoresign'},
1861 header => \@new_header_2,
1862 comment => $comment,
1863 individuals => \@new_ids_2,
1864 target => $target,
1865 ignore_missing_files => 1 );
1868 end subset_vertically
1870 # }}}
1872 # {{{ subsets
1874 start subsets
1876 # if ( defined $expression and defined $bins ) {
1877 # die "data -> subset: expression and bins may not both be specified\n";
1879 # if ( not ( defined $expression or defined $bins ) ) {
1880 # die "data -> subset: expression or bins must be specified\n";
1882 $self -> synchronize;
1883 my @header = @{$self -> {'header'}};
1884 my @comment = defined $self -> {'comment'} ? @{$self -> {'comment'}} : ();
1885 my @subset_ids= ();
1886 my %rnd_ids;
1887 my $key = 0;
1888 my @ids = @{$self -> {'individuals'}};
1889 if ( defined $stratify_on ) {
1890 my $work_data = $self -> copy( filename => 'work_data.dta',
1891 target => 'mem' );
1892 my %strata = %{$work_data -> factors( column => $stratify_on )};
1893 # $Data::Dumper::Maxdepth = 1;
1894 # print Dumper \%strata;
1896 while ( my ( $factor, $keys ) = each %strata ) {
1897 foreach my $key ( @{$keys} ) {
1898 my $rnd_num = rand;
1899 while ( defined $rnd_ids{$factor}{$rnd_num} ) {
1900 $rnd_num = rand;
1902 $rnd_ids{$factor}{$rnd_num} = $ids[$key];
1905 my $first = 1;
1906 while ( my ( $factor, $rnd_nums ) = each %rnd_ids ) {
1907 my @sort_rnd_nums = sort { $a <=> $b } keys %{$rnd_nums};
1908 for ( my $i = 0; $i <= $#sort_rnd_nums; $i ) {
1909 for ( my $j = 0; $j < $bins; $j++ ) {
1910 if ( $first ) {
1911 push( @subset_ids, [$rnd_ids{$factor}{$sort_rnd_nums[$i]} -> copy] );
1912 push( @incl_ids, [$rnd_ids{$factor}{$sort_rnd_nums[$i]} -> idnumber] );
1913 } else {
1914 push( @{$subset_ids[$j]}, $rnd_ids{$factor}{$sort_rnd_nums[$i]} -> copy );
1915 push( @{$incl_ids[$j]}, $rnd_ids{$factor}{$sort_rnd_nums[$i]} -> idnumber );
1917 $i++;
1918 last if $i > $#sort_rnd_nums;
1920 $first = 0;
1923 for ( my $j = 0; $j < $bins; $j++ ) {
1924 my $sdata = data -> new ( header => \@header,
1925 comment => \@comment,
1926 ignoresign => $self -> {'ignoresign'},
1927 individuals => $subset_ids[$j],
1928 ignore_missing_files => 1,
1929 target => 'disk',
1930 idcolumn => $self -> {'idcolumn'},
1931 filename => "subset_$j.dta" );
1932 #$sdata -> _write;
1933 push( @subsets, $sdata );
1935 } else {
1936 for ( my $i = 0; $i <= $#ids; $i++ ) {
1937 my $rnd_num = rand;
1938 while ( defined $rnd_ids{$rnd_num} ) {
1939 $rnd_num = rand;
1941 $rnd_ids{$rnd_num} = $ids[$i];
1943 my @keys = sort { $a <=> $b } keys %rnd_ids;
1944 my $first = 1;
1945 for ( my $i = 0; $i <= $#keys; $i ) {
1946 for ( my $j = 0; $j < $bins; $j++ ) {
1947 if ( $first ) {
1948 push( @subset_ids, [$rnd_ids{$keys[$i]} -> copy] );
1949 push( @incl_ids, [$rnd_ids{$keys[$i]} -> idnumber] );
1950 } else {
1951 push( @{$subset_ids[$j]}, $rnd_ids{$keys[$i]} -> copy );
1952 push( @{$incl_ids[$j]}, $rnd_ids{$keys[$i]} -> idnumber );
1954 $i++;
1955 last if $i > $#keys;
1957 $first = 0;
1959 for ( my $j = 0; $j < $bins; $j++ ) {
1960 my $sdata = data -> new ( header => \@header,
1961 comment => \@comment,
1962 ignoresign => $self -> {'ignoresign'},
1963 individuals => $subset_ids[$j],
1964 ignore_missing_files => 1,
1965 target => $target,
1966 idcolumn => $self -> {'idcolumn'},
1967 filename => "subset_$j.dta" );
1968 #$sdata -> _write;
1969 push( @subsets, $sdata );
1973 end subsets
1975 # }}} subsets
1977 # {{{ subset
1979 start subset
1981 $self -> synchronize;
1982 my @header = @{$self -> {'header'}};
1983 my @comment = defined $self -> {'comment'} ? @{$self -> {'comment'}} : ();
1984 my @subset_inds = ();
1985 my $key = 0;
1986 foreach my $individual ( @{$self -> {'individuals'}} ) {
1987 if ( $individual -> evaluate_expression( column => $based_on,
1988 expression => $expression ) ) {
1989 push( @subset_inds, $individual -> copy );
1990 push( @incl_individuals, $individual -> idnumber );
1991 push( @included_keys, $key );
1993 $key++;
1995 $subset = data -> new ( header => \@header,
1996 comment => \@comment,
1997 ignoresign => $self -> {'ignoresign'},
1998 individuals => \@subset_inds,
1999 idcolumn => $self -> {'idcolumn'},
2000 filename => "subset.dta" );
2002 end subset
2004 # }}} subset
2006 # {{{ target
2008 start target
2010 if ( $parm eq 'disk' and $self -> {'target'} eq 'mem' ) {
2011 $self -> {'target'} = 'disk';
2012 $self -> flush;
2013 } elsif ( $parm eq 'mem' and $self -> {'target'} eq 'disk' ) {
2014 $self -> {'target'} = 'mem';
2015 $self -> synchronize;
2018 end target
2020 # }}}
2022 # {{{ _write
2024 start _write
2026 die "ERROR: data -> _write: No filename set in data object.\n"
2027 if( $filename eq '' );
2029 # $Data::Dumper::Maxdepth = 2;
2030 # die Dumper $self -> {'individuals'};
2032 if( not defined $self -> {'individuals'} ){
2034 # If we don't have any individuals and write to a new
2035 # filename, we must first read individuals from the old
2036 # file. A call to synchronize will do that. There is no risk
2037 # of a infinite loop here since synchronize allways writes to
2038 # "full_name".
2040 unless( $filename eq $self -> full_name ){
2041 $self -> synchronize;
2045 open(FILE,">$filename") ||
2046 die "Could not create $filename\n";
2047 my $data_ref = $self -> format_data;
2048 my @data = @{$data_ref};
2049 for ( @data ) {
2050 print ( FILE );
2052 close(FILE);
2054 # if ( $PsN::config -> {'_'} -> {'use_database'} and
2055 # $self -> {'use_data_table'} ) {
2056 # # Backslashes messes up the sql syntax
2057 # my $file_str = $self->{'filename'};
2058 # my $dir_str = $self->{'directory'};
2059 # $file_str =~ s/\\/\//g;
2060 # $dir_str =~ s/\\/\//g;
2062 # # md5sum
2063 # my $md5sum = md5_hex(OSspecific::slurp_file($self-> full_name ));
2064 # my ( $date_str, $time_str );
2065 # if ( $Config{osname} eq 'MSWin32' ) {
2066 # $date_str = `date /T`;
2067 # $time_str = ' '.`time /T`;
2068 # } else {
2069 # # Assuming UNIX
2070 # $date_str = `date`;
2072 # chomp($date_str);
2073 # chomp($time_str);
2074 # my $date_time = $date_str.$time_str;
2075 # my $dbh = DBI -> connect("DBI:mysql:host=".$PsN::config -> {'_'} -> {'database_server'}.
2076 # ";databse=".$PsN::config -> {'_'} -> {'project'},
2077 # $PsN::config -> {'_'} -> {'user'},
2078 # $PsN::config -> {'_'} -> {'password'},
2080 # 'RaiseError' => 1});
2081 # my $sth;
2082 # if ( defined $self -> {'data_id'} ) {
2083 # $sth = $dbh -> prepare( "UPDATE ".$PsN::config -> {'_'} -> {'project'}.
2084 # ".data ".
2085 # "SET filename='$file_str',date='$date_time',".
2086 # "directory='$dir_str',md5sum='$md5sum' ".
2087 # "WHERE data_id='".$self -> {'data_id'}."'" );
2088 # $sth -> execute or debug -> die( message => $sth->errstr ) ;
2089 # } else {
2090 # $sth = $dbh -> prepare("INSERT INTO ".$PsN::config -> {'_'} -> {'project'}.
2091 # ".data (filename,date,directory,md5sum) ".
2092 # "VALUES ('$file_str', '$date_time', '$dir_str','".
2093 # $md5sum."' )");
2094 # $sth -> execute;
2095 # $self -> {'data_id'} = $sth->{'mysql_insertid'};
2097 # $sth -> finish;
2098 # $dbh -> disconnect;
2101 end _write
2103 # }}} _write
2105 # {{{ flush
2106 start flush
2108 # synchronizes the object with the file on disk and empties
2109 # most of the objects attributes to save memory.
2110 if( defined $self -> {'individuals'} and
2111 ( !$self -> {'synced'} or $force ) ) {
2112 $self -> _write;
2114 # $self -> {'header'} = undef;
2115 $self -> {'comment'} = undef;
2116 $self -> {'individuals'} = undef;
2117 $self -> {'synced'} = 0;
2118 $self -> {'column_head_indices'} = undef;
2119 $self -> {'have_missing_data'} = undef;
2121 end flush
2122 # }}} flush
2124 # {{{ synchronize
2126 start synchronize
2128 # synchronizes the object with the file on disk
2129 unless( $self -> {'synced'} ){
2130 if( defined $self -> {'individuals'} and
2131 scalar @{$self -> {'individuals'}} > 0 ){
2132 # We should not read new data from file if we
2133 # have an individuals defined?
2134 # Perhaps there should be an attribute
2135 # 'from_file' that overrides this and reads in
2136 # the data from the file specified in filename
2137 # and overwrites whatever the object already
2138 # contains?
2139 # if( -e $self -> {'filename'} ){
2140 # $self -> _read_header;
2141 # $self -> _read_individuals;
2143 $self -> _write;
2144 } else {
2145 if( -e $self -> full_name ){
2146 unless( defined $self -> {'header'} and scalar @{$self -> {'header'}} > 0 ){
2147 $self -> _read_header;
2149 $self -> _read_individuals;
2150 } else {
2151 debug -> die( message => "Fatal error: datafile: " . $self -> full_name . " does not exist." );
2152 return;
2156 my $i = 1;
2157 foreach my $head ( @{$self -> {'header'}} ){
2158 $self -> {'column_head_indices'} -> {$head} = $i;
2159 $i++;
2161 $self -> {'synced'} = 1;
2163 end synchronize
2165 # }}} synchronize
2167 # {{{ _fisher_yates_shuffle
2169 start _fisher_yates_shuffle
2171 my $arr_ref = $parm{'array'};
2172 debug -> warn( level => 1,
2173 message => "Array of zero length received" )
2174 if ( scalar @{$arr_ref} < 1 );
2175 my $i;
2176 for ($i = @$arr_ref; --$i; ) {
2177 my $j = random_uniform_integer(1,0,$i);
2178 # my $j = int rand ($i+1);
2179 # print "$j $j_new\n";
2180 @$arr_ref[$i,$j] = @$arr_ref[$j,$i];
2183 end _fisher_yates_shuffle
2185 # }}} _fisher_yates_shuffle
2187 # {{{ _read_header
2189 start _read_header
2191 my $filename = $self -> full_name;
2192 my $ignoresign = $self -> ignoresign;
2193 my ( @data, @new_record, $row, $tmp_row, @header, $hdrstring );
2195 open(DATAFILE,"$filename") ||
2196 die "Could not open $filename for reading";
2197 my $columns;
2198 while (<DATAFILE>) {
2199 $tmp_row = $_;
2200 # @new_record = split(/\,|\s+/,$_);
2201 if ( ! (/^\s*\d+|^\s*\./) ) {
2202 $data[$row] = $tmp_row;
2203 $row++;
2204 } else {
2205 # We have reached the first data-row, return.
2206 $columns = scalar split(/\,\s*|\s+/);
2207 last;
2210 close(DATAFILE);
2212 if ( defined $self -> {'cont_column'} and not $self -> {'table_file'} ) {
2213 my $data_len = $#data;
2214 for ( my $i = $data_len; $i >= 0; $i-- ) {
2215 my @arr = split(/\,\s*|\s+/,$data[$i]);
2216 if ( $arr[$self -> {'cont_column'}-1] eq 'CONT' ) {
2217 my $start = $i == $data_len ? 0 : 1;
2218 for ( my $j = $start; $j <= $#arr; $j++ ) {
2219 if ( $j != ($self -> {'cont_column'}-1) ) {
2220 push( @header, $arr[$j] );
2223 pop( @data );
2226 $header[0] =~ s/$ignoresign//
2227 if ( defined $self->ignoresign );
2228 shift( @header ) if ( $header[0] eq "" );
2229 } else {
2230 chomp( $hdrstring = pop(@data));
2231 @header = split(/\,\s*|\s+/,$hdrstring);
2232 $header[0] =~ s/$ignoresign//
2233 if ( defined $self->ignoresign );
2234 shift( @header ) if ( $header[0] eq "" );
2235 if( $self -> {'table_file'} ) {
2236 my @new_header;
2237 for( my $i = 1; $i <= scalar @header; $i++ ) {
2238 if( $header[$i-1] eq 'CONT' ) {
2239 if ( defined $self -> {'cont_column'} and not $i == $self -> {'cont_column'} ) {
2240 debug -> warn( level => 1,
2241 message => "The supplied columns for the CONT data item (".
2242 $self -> {'cont_column'}.") does not match the column where the CONT ".
2243 "header was found ($i), using $i" );
2245 $self -> {'cont_column'} = $i;
2246 } else {
2247 push( @new_header, $header[$i-1] );
2250 @header = @new_header;
2251 for( my $i = 1; $i <= scalar @header; $i++ ) {
2252 if( $header[$i-1] eq 'ID' ) {
2253 if ( defined $self -> {'idcolumn'} and not $i == $self -> {'idcolumn'} ) {
2254 debug -> warn( level => 1,
2255 message => "The supplied columns for the ID data item (".
2256 $self -> {'idcolumn'}.") does not match the column where the CONT ".
2257 "header was found ($i), using $i" );
2259 $self -> {'idcolumn'} = $i;
2265 # I'm not certain on how to deal with this conflict. I'm leaving it commented because I believe this code should not be here.
2267 #<<<<<<< data_subs.pm
2268 # $header[0] =~ s/$ignoresign//
2269 # if ( defined $self->ignoresign );
2270 # shift( @header ) if ( $header[0] eq "" );
2271 #=======
2272 #>>>>>>> 1.28
2274 # It is ok with data sets without a header.
2275 # unless( scalar @header > 0 ){ debug -> die( message => 'Datafile ' . $self -> full_name . ' is empty.' ); }
2277 $self -> {'header'} = \@header;
2278 $self -> {'comment'} = \@data;
2279 # if ( $PsN::config -> {'_'} -> {'use_database'} and
2280 # $self -> {'use_data_table'} ) {
2281 # my $dbh = DBI -> connect("DBI:mysql:host=".$PsN::config -> {'_'} -> {'database_server'}.
2282 # ";databse=".$PsN::config -> {'_'} -> {'project'},
2283 # $PsN::config -> {'_'} -> {'user'},
2284 # $PsN::config -> {'_'} -> {'password'},
2285 # {'RaiseError' => 1});
2286 # if ( scalar @header < 1 ) {
2287 # for ( my $i = 1; $i <= $columns; $i++ ) {
2288 # push( @header, $i );
2291 # for ( my $i = 0; $i <= $#header; $i++ ) {
2292 # my $sth = $dbh -> prepare("INSERT INTO ".$PsN::config -> {'_'} -> {'project'}.
2293 # ".data_column ".
2294 # "(name,number,data_id) ".
2295 # "VALUES ('".$header[$i]."', '".($i+1).
2296 # "', '".$self -> {'data_id'}."' )");
2297 # $sth -> execute;
2298 # push( @{$self -> {'data_column_ids'}}, $sth->{'mysql_insertid'} );
2299 # $sth -> finish;
2301 # $dbh -> disconnect;
2304 end _read_header
2306 # }}} _read_header
2308 # {{{ _read_individuals
2310 start _read_individuals
2312 my $idcol = $self -> idcolumn;
2313 my $filename = $self -> full_name;
2314 debug -> warn( level => 1,
2315 message => "Building array of individuals from file " . $self -> {'filename'} );
2316 open(DATAFILE,"$filename") ||
2317 die "Could not open $filename for reading";
2318 my ( @new_row, $new_ID, $old_ID, @init_data );
2319 my $buffer;
2320 my $lines = 0;
2321 while (sysread DATAFILE, $buffer, 4096) {
2322 $lines += ($buffer =~ tr/\n//);
2324 seek( DATAFILE, 0,0 );
2326 # For status bar:
2327 my $status_bar = status_bar -> new( steps => $lines );
2329 ui -> print( category => 'scm',
2330 message => "Reading data file: ".$self -> filename );
2331 ui -> print( category => 'scm',
2332 message => $status_bar -> print_step(),
2333 newline => 0);
2335 my ( $sth, $dbh, $first_row_id, $first_value_id );
2336 my $insert = 1;
2337 # if ( $PsN::config -> {'_'} -> {'use_database'} and
2338 # $self -> {'use_data_table'} ) {
2339 # $dbh = DBI -> connect("DBI:mysql:host=".$PsN::config -> {'_'} -> {'database_server'}.
2340 # ";databse=".$PsN::config -> {'_'} -> {'project'},
2341 # $PsN::config -> {'_'} -> {'user'},
2342 # $PsN::config -> {'_'} -> {'password'},
2343 # {'RaiseError' => 1});
2344 # my $sth = $dbh -> prepare( "SELECT data_row_id FROM ".$PsN::config -> {'_'} -> {'project'}.
2345 # ".data_row ".
2346 # "WHERE data_id='".$self -> {'data_id'}."'" );
2347 # $sth -> execute or debug -> die( message => $sth->errstr ) ;
2348 # my $select_arr = $sth -> fetchall_arrayref;
2349 # if ( scalar @{$select_arr} > 0 ) {
2350 # for ( my $i = 0; $i < scalar @{$select_arr}; $i++ ) {
2351 # push( @{$self -> {'data_row_ids'}}, $select_arr->[$i][0] );
2353 # $sth = $dbh -> prepare( "SELECT data_value_id FROM ".$PsN::config -> {'_'} -> {'project'}.
2354 # ".data_value ".
2355 # "WHERE data_id='".$self -> {'data_id'}."'" );
2356 # $sth -> execute or debug -> die( message => $sth->errstr ) ;
2357 # my $select_val = $sth -> fetchall_arrayref;
2358 # for ( my $i = 0; $i < scalar @{$select_val}; $i++ ) {
2359 # push( @{$self -> {'data_value_ids'}}, $select_val->[$i][0] );
2361 # $insert = 0;
2362 # $dbh -> disconnect;
2363 # } else {
2364 # $dbh -> do( "LOCK TABLES ".$PsN::config -> {'_'} -> {'project'}.
2365 # ".data_row WRITE, ".$PsN::config -> {'_'} -> {'project'}.
2366 # ".data_value WRITE" );
2367 # $sth = $dbh -> prepare( "SELECT MAX(data_row_id) FROM ".$PsN::config -> {'_'} -> {'project'}.
2368 # ".data_row" );
2369 # $sth -> execute or debug -> die( message => $sth->errstr ) ;
2370 # my $select_arr = $sth -> fetchall_arrayref;
2371 # $first_row_id = defined $select_arr -> [0][0] ? $select_arr -> [0][0] : 0;
2372 # $sth = $dbh -> prepare( "SELECT MAX(data_value_id) FROM ".$PsN::config -> {'_'} -> {'project'}.
2373 # ".data_value" );
2374 # $sth -> execute or debug -> die( message => $sth->errstr ) ;
2375 # my $select_arr = $sth -> fetchall_arrayref;
2376 # $first_value_id = defined $select_arr -> [0][0] ? $select_arr -> [0][0] : 0;
2378 # $sth -> finish;
2381 my $insert_rows;
2382 my $insert_values;
2383 my $row_counter = 0;
2384 my $full_row;
2385 ROW: while ( <DATAFILE> ) {
2386 s/^ *//;
2387 my @new_row = split(/\,\s*|\s+/);
2388 # This regexp check is not time consuming.
2389 if ( /^\s*\d+|^\s*\./ ) {
2390 if ( defined $self -> {'cont_column'} ) {
2391 if ( $new_row[$self -> {'cont_column'} - 1] == 1 ) {
2392 if ( not $self -> {'table_file'} ) { # Skip the CONT=1 rows if this is a table file
2393 for ( my $i = $#new_row; $i > 0; $i-- ) {
2394 if ( $i != ($self -> {'cont_column'} - 1) ) {
2395 unshift( @{$full_row}, $new_row[$i] );
2399 next ROW;
2400 } else {
2401 for ( my $i = $#new_row; $i >= 0; $i-- ) {
2402 # if ( $i != ($self -> {'cont_column'} - 1) or $self -> {'table_file'} ) {
2403 if ( $i != ($self -> {'cont_column'} - 1) ) {
2404 unshift( @{$full_row}, $new_row[$i] );
2408 } else {
2409 @{$full_row} = @new_row;
2411 $new_ID = $full_row -> [$idcol-1]; # index starts at 0
2412 $old_ID = $new_ID if ( not defined $old_ID );
2414 # Check if column miss data at some row (This adds about 30% of init time)
2415 my $mdt = $self -> {'missing_data_token'};
2416 for( my $i = 0; $i <= $#{$full_row}; $i++ ){
2417 $self -> {'have_missing_data'} -> {$i+1} = 1
2418 if( $full_row -> [$i] == $mdt ); # == is slower but safer than eq
2420 # if ( $PsN::config -> {'_'} -> {'use_database'} and
2421 # $self -> {'use_data_table'} and $insert ) {
2422 # $row_counter++;
2423 # $insert_rows = $insert_rows."," if ( defined $insert_rows );
2424 # $insert_rows = $insert_rows.
2425 # "('$row_counter', '".$self -> {'data_id'}."' )";
2426 # for ( my $j = 0; $j <= $#{$full_row}; $j++ ) {
2427 # $insert_values = $insert_values."," if ( defined $insert_values );
2428 # $insert_values = $insert_values.
2429 # "('".$full_row -> [$j]."', '".
2430 # ($first_row_id+$row_counter)."', '".
2431 # $self -> {'data_column_ids'}->[$j].
2432 # "', '".$self -> {'data_id'}."' )";
2436 if ( $new_ID != $old_ID ) {
2437 my @subject_data = @init_data;
2438 my $id = data::individual -> new ( idcolumn => $idcol,
2439 subject_data => \@subject_data,
2440 data_id => $self -> {'data_id'} );
2441 push( @{$self -> {'individuals'}}, $id );
2442 @init_data =(join( ",", @{$full_row}));
2443 } else {
2444 push( @init_data, join( ",", @{$full_row}) );
2446 $old_ID = $new_ID;
2447 $full_row = undef;
2449 if ( $status_bar -> tick() ) {
2450 ui -> print( category => 'scm',
2451 message => $status_bar -> print_step(),
2452 wrap => 0,
2453 newline => 0 );
2457 # if ( $PsN::config -> {'_'} -> {'use_database'} and
2458 # $self -> {'use_data_table'} and $insert ) {
2459 # $dbh -> do("INSERT INTO ".$PsN::config -> {'_'} -> {'project'}.
2460 # ".data_row ".
2461 # "(number,data_id) ".
2462 # "VALUES ".$insert_rows);
2463 # push( @{$self -> {'data_row_ids'}}, ($first_row_id..$first_row_id+$row_counter) );
2464 # $dbh -> do( "INSERT INTO ".$PsN::config -> {'_'} -> {'project'}.
2465 # ".data_value ".
2466 # "(value,data_row_id,data_column_id,data_id) ".
2467 # "VALUES ".$insert_values );
2468 # push( @{$self -> {'data_value_ids'}},
2469 # ($first_value_id..$first_value_id+($row_counter*
2470 # scalar @{$self->{'data_column_ids'}})));
2471 # $dbh -> do( "UNLOCK TABLES" );
2472 # $dbh -> disconnect;
2475 if ( $#init_data >= 0 ) {
2476 push( @{$self -> {'individuals'}},
2477 data::individual -> new ( idcolumn => $idcol,
2478 subject_data => \@init_data ) );
2480 ui -> print( category => 'scm',
2481 message => " ... done" );
2482 close(DATAFILE);
2483 # $self -> _write( filename => 'test.dta' );
2485 end _read_individuals
2487 # }}} _read_individuals