1 This file contains the code for the subroutines in
2 Perl-speaks-NONMEM's data module. It is not functional by itself.
3 The code should be transferred to the module autogenerated by dia2code
4 using the fill_diacode.pl script.
8 start include statements
9 use Digest
::MD5
'md5_hex';
21 use Time
::HiRes
qw(gettimeofday);
22 my @primary_column_names = ('ID', 'DATE', 'DAT1', 'DAT2', 'DAT3' ,'L1', 'L2', 'DV', 'MDV', 'RAW_', 'MRG_', 'RPT_', 'TIME', 'DROP', 'SKIP', 'EVID', 'AMT', 'RATE', 'SS', 'II', 'ADDL', 'CMT', 'PCMT', 'CALL');
25 # }}} include statements
30 # The structure of the data class is subject-centric, recognising that
31 # the subjects included in a study often can be regarded as
32 # independent. A class for the subject level exists within PsN and is
33 # called the individual class. A data object consists of at least one
34 # but probably many individual objects plus optional comments.
44 # my $data_obj = data -> new ( filename => 'test040314.dta' );
46 # $data_obj -> renumber_ascending;
48 # my $subsets_ref = $data_obj -> case_deletion( bins => 10 );
50 # my @subsets = @{$subsets_ref};
60 # <a HREF="model.html">model</a>, <a HREF="output.html">output</a>,
61 # <a HREF="tool/modelfit.html">tool::modelfit</a>,
62 # <a HREF="tool.html">tool</a>
68 # model, output, tool::modelfit, tool
79 # If the column holding the subject identifier is not the
80 # first, it can be specified using the I<idcolumn> attribute
82 # I<ignoresign> determines which rows are regarded as
83 # comments. It corresponds to the IGNORE= option in the $DATA
84 # record in a NONMEM model file.
86 $this -> {'use_data_table'} = 0;
88 ( $this -> {'directory'},
89 $this -> {'filename'} ) = OSspecific
::absolute_path
( $this -> {'directory'},
90 $this->{'filename'} );
92 debug
-> warn( level
=> 2,
93 message
=> "data -> new: Data object initialized from file: ".
96 # sub register_in_database {
98 # # Backslashes mess up the SQL syntax
99 # my $file_str = $this->{'filename'};
100 # my $dir_str = $this->{'directory'};
101 # $file_str =~ s/\\/\//g;
102 # $dir_str =~ s/\\/\//g;
105 # my $md5sum = md5_hex(OSspecific::slurp_file($this-> full_name ));
107 # connect("DBI:mysql:host=".$PsN::config -> {'_'} -> {'database_server'}.
108 # ";databse=".$PsN::config -> {'_'} -> {'project'},
109 # $PsN::config -> {'_'} -> {'user'},
110 # $PsN::config -> {'_'} -> {'password'},
111 # {'RaiseError' => 1});
113 # my $sth = $dbh -> prepare( "SELECT data_id FROM ".$PsN::config -> {'_'} -> {'project'}.
115 # "WHERE filename = '$file_str' AND ".
116 # "directory = '$dir_str' AND ".
117 # "md5sum = '".$md5sum."'" );
118 # $sth -> execute or debug -> die( message => $sth->errstr ) ;
119 # my $select_arr = $sth -> fetchall_arrayref;
120 # if ( scalar @{$select_arr} > 0 ) {
121 # debug -> warn( level => 1,
122 # message => "Found an old entry in the database matching the ".
123 # "current data file" );
124 # if ( scalar @{$select_arr} > 1 ) {
125 # debug -> warn( level => 1,
126 # message => "Found more than one matching entry in database".
127 # ", using the first" );
129 # $this -> {'data_id'} = $select_arr->[0][0];
131 # my ( $date_str, $time_str );
132 # if ( $Config{osname} eq 'MSWin32' ) {
133 # $date_str = `date /T`;
134 # $time_str = ' '.`time /T`;
137 # $date_str = `date`;
141 # my $date_time = $date_str.$time_str;
142 # $sth = $dbh -> prepare("INSERT INTO ".$PsN::config -> {'_'} -> {'project'}.
143 # ".data (filename,date,directory,md5sum) ".
144 # "VALUES ('$file_str', '$date_time', '$dir_str','".
147 # $this -> {'data_id'} = $sth->{'mysql_insertid'};
150 # $dbh -> disconnect;
154 unless ( ( defined $this -> {'header'} and
155 scalar @
{$this -> {'header'}} > 0 ) or
156 ( defined $this -> {'individuals'} and
157 scalar @
{$this -> {'individuals'}} > 0 ) ) {
158 if ( -e
$this -> full_name
) {
159 if ( $this -> {'target'} eq 'mem' ) {
160 # &register_in_database( $this ) if ( $PsN::config -> {'_'} -> {'use_database'} and
161 # $this -> {'use_data_table'} );
162 $this -> _read_header
;
163 $this -> _read_individuals
;
164 $this -> {'synced'} = 1;
166 $this -> {'synced'} = 0;
169 debug
-> die(message
=> "No header, individuals, and no file " . $this -> full_name
. " on disk.")
170 unless $this -> {'ignore_missing_files'};
171 $this -> {'synced'} = 0;
174 if ( $this -> {'target'} eq 'mem') {
175 if ( -e
$this -> {'filename'} ) {
176 $this -> _read_header
;
177 # &register_in_database if ( $PsN::config -> {'_'} -> {'use_database'} and
178 # $this -> {'use_data_table'} );
179 $this -> _read_individuals
;
180 $this -> {'synced'} = 1;
182 debug
-> die(message
=> "No file:".$this->{'filename'}." on disk" )
183 unless $this -> {'ignore_missing_files'};
184 $this -> {'synced'} = 0;
191 if ( $this -> {'synced'} ) {
193 foreach my $head ( @
{$this -> {'header'}} ) {
194 $this -> {'column_head_indices'} -> {$head} = $i;
198 # $Data::Dumper::Maxdepth = 3;
199 # die Dumper $this -> {'individuals'};
205 # {{{ register_in_database
206 start register_in_database
# Registers this data file in the project database (only when the
# 'use_database' configuration flag is set) and leaves the row id in
# $self->{'data_id'}, which is also copied to $data_id at the end.
# A row matching filename, directory and MD5 checksum is looked up and
# its data_id reused; in the elsif branch (individuals defined) an
# INSERT for a new row is prepared instead.
# NOTE(review): the embedded line numbering of this listing has gaps, so
# some statements (closing braces, declarations, possibly execute calls)
# are not visible here; the comments cover only what is shown.
# NOTE(review): $model_id, $resampled and @individual_ids are not declared
# in the visible code - presumably arguments provided by the generated
# wrapper; confirm.
207 if ( $PsN::config
-> {'_'} -> {'use_database'} ) {
208 # Backslashes mess up the SQL syntax
209 my $file_str = $self->{'filename'};
210 my $dir_str = $self->{'directory'};
211 $file_str =~ s/\\/\//g
;
212 $dir_str =~ s/\\/\//g
;
214 my $project = $PsN::config
-> {'_'} -> {'project'};
# MD5 hex digest of whatever OSspecific::slurp_file returns for the file
# (presumably its contents); used below to recognise an identical file.
216 my $md5sum = md5_hex
(OSspecific
::slurp_file
($self-> full_name
));
# NOTE(review): the DSN key below is spelled ";databse=". DBD::mysql
# documents the key as "database"; the queries in this block qualify
# table names with the project name, which may be why this went
# unnoticed - confirm before changing.
218 my $dbh = DBI
-> connect("DBI:mysql:host=".$PsN::config
-> {'_'} -> {'database_server'}.
219 ";databse=".$project,
220 $PsN::config
-> {'_'} -> {'user'},
221 $PsN::config
-> {'_'} -> {'password'},
222 {'RaiseError' => 1});
# Look for an already registered copy of this file.
# NOTE(review): file name, directory and checksum are interpolated
# straight into the SQL text; bind placeholders would be safer.
229 my $sth = $dbh -> prepare
( "SELECT data_id FROM ".$project.
231 "WHERE filename = '$file_str' AND ".
232 "directory = '$dir_str' AND ".
233 "md5sum = '".$md5sum."'" );
234 $sth -> execute
or debug
-> die( message
=> $sth->errstr ) ;
235 $select_arr = $sth -> fetchall_arrayref
;
238 if ( scalar @
{$select_arr} > 0 ) {
239 'debug' -> warn( level
=> 1,
240 message
=> "Found an old entry in the database matching the ".
241 "current data file" );
242 if ( scalar @
{$select_arr} > 1 ) {
243 'debug' -> warn( level
=> 1,
244 message
=> "Found more than one data matching entry in database".
245 ", using the first" );
# Reuse the data_id of the first matching row.
247 $self -> {'data_id'} = $select_arr->[0][0];
# Collect the individual ids already linked to this data_id; the map
# below replaces each fetched row (an array ref) with its first column.
249 my $sth = $dbh -> prepare
( "SELECT individual_id FROM ".$project.".data_individual ".
250 "WHERE data_id = '".$self -> {'data_id'}."'" );
251 $sth -> execute
or debug
-> die( message
=> $sth->errstr ) ;
252 my $id_arr = $sth -> fetchall_arrayref
;
253 map( $_ = $_ -> [0], @
{$id_arr} );
254 $self -> {'individual_ids'} = $id_arr;
255 } elsif ( defined $self -> {'individuals'} ) {
# Individuals are loaded: assemble the values for a new row in the
# data table.
256 my ( $date_str, $time_str );
# On MSWin32 the timestamp is taken from the shell's 'date /T' and
# 'time /T' commands.
257 if( $Config{osname
} eq 'MSWin32' ){
258 $date_str = `date /T`;
259 $time_str = ' '.`time /T`;
266 my $date_time = $date_str.$time_str;
267 my ( $columns, $values );
# The resampled flag is stored as the string '1' or '0'.
268 my $res_str = $resampled ?
'1' : '0';
# The model_id column is included only when a model id is defined.
269 if ( defined $model_id ) {
270 $columns = '(model_id, filename, date, directory, md5sum, resampled)';
271 $values = "('$model_id', '$file_str', '$date_time', '$dir_str','".
272 $md5sum."', '$res_str' )";
274 $columns = '(filename, date, directory, md5sum, resampled)';
275 $values = "('$file_str', '$date_time', '$dir_str','".$md5sum."', '$res_str' )";
# NOTE(review): no execute call is visible between this prepare and the
# read of mysql_insertid below (the line in between is not shown in this
# listing) - confirm the statement is actually executed.
277 $sth = $dbh -> prepare
("INSERT INTO ".$PsN::config
-> {'_'} -> {'project'}.
278 ".data $columns VALUES $values");
280 $self -> {'data_id'} = $sth->{'mysql_insertid'};
# With a data_id available, register the individuals and link them to it.
282 if ( defined $self -> {'data_id'} ) {
284 my $columns = "( id_key, id )";
# Individual ids were supplied: only the data/individual relation is
# stored.
285 if( $#individual_ids >= 0 ) {
286 $self -> register_di_relation
( individual_ids
=> \
@individual_ids );
# Presumably the else-branch (the line in between is not shown): insert
# the individuals themselves while holding a write lock on the
# individual table.
288 my $inds = scalar @
{$self -> {'individuals'}};
289 $dbh -> do( "LOCK TABLES ".$PsN::config
-> {'_'} -> {'project'}.
290 ".individual WRITE" );
291 # $sth = $dbh -> prepare( "SELECT MAX(individual_id)".
292 # " FROM ".$PsN::config -> {'_'} -> {'project'}.
294 $dbh -> do( 'USE '.$PsN::config
-> {'_'} -> {'project'} );
# Column index 10 of the SHOW TABLE STATUS row is used as the first new
# id - presumably MySQL's Auto_increment column; verify against the
# server version in use.
295 $sth = $dbh -> prepare
( "SHOW TABLE STATUS LIKE 'individual'" );
296 $sth -> execute
or debug
-> die( message
=> $sth->errstr ) ;
297 my $select_arr = $sth -> fetchall_arrayref
;
298 my $first_id_id = $select_arr -> [0][10] ?
299 $select_arr -> [0][10] : 0;
300 # my $first_id_id = $select_arr -> [0][0] ? ($select_arr -> [0][0] + 1) : 0;
301 my $last_id_id = $first_id_id + $inds - 1;
# Build one "( index, idnumber )" tuple per defined individual for a
# multi-row INSERT.
302 for( my $i = 0; $i < $inds; $i++ ) {
303 if( defined $self -> {'individuals'}[$i] ) {
304 my $id_id = $self -> {'individuals'}[$i] -> idnumber
;
305 $values = $values."," if ( defined $values );
306 $values = $values."( $i, $id_id )";
309 $sth = $dbh -> prepare
( "INSERT INTO ".$PsN::config
-> {'_'} -> {'project'}.
310 ".individual $columns VALUES $values" );
312 $dbh -> do( "UNLOCK TABLES" );
# Assumes the new rows received consecutive ids from $first_id_id to
# $last_id_id.
313 @individual_ids = ($first_id_id .. $last_id_id);
314 $self -> register_di_relation
( individual_ids
=> \
@individual_ids );
316 $self -> {'individual_ids'} = \
@individual_ids;
321 $data_id = $self -> {'data_id'}; # return the data_id
323 end register_in_database
324 # }}} register_in_database
326 # {{{ register_di_relation
327 start register_di_relation
# Stores the relation between this data object and a list of individuals:
# one "(data_id, individual_id)" tuple per defined entry of
# @individual_ids is put into a single INSERT on the project's
# data_individual table. Nothing is done unless the 'use_database'
# configuration flag is set, $self->{'data_id'} is defined and
# @individual_ids is non-empty.
# NOTE(review): the embedded line numbering of this listing has gaps, so
# some statements (declarations of $sth/$values, closing braces, possibly
# the execute call) are not visible here.
328 if ( $PsN::config
-> {'_'} -> {'use_database'} and
329 defined $self -> {'data_id'} and $#individual_ids >= 0 ) {
# NOTE(review): the attribute below is spelled 'raiseerror', while
# register_in_database passes 'RaiseError'. DBI documents the attribute
# as RaiseError, so the lower-case key presumably does not enable error
# raising here - confirm.
# NOTE(review): the DSN key ";databse=" looks like a typo for
# "database"; confirm before changing.
330 my $dbh = DBI
-> connect("DBI:mysql:host=".$PsN::config
-> {'_'} -> {'database_server'}.
331 ";databse=".$PsN::config
-> {'_'} -> {'project'},
332 $PsN::config
-> {'_'} -> {'user'},
333 $PsN::config
-> {'_'} -> {'password'},
334 {'raiseerror' => 1});
337 my $columns = "( data_id, individual_id )";
# Build the comma-separated VALUES list, skipping undefined ids.
338 foreach my $individual_id ( @individual_ids ) {
339 if ( defined $individual_id ) {
340 $values = $values."," if ( defined $values );
341 $values = $values."(".$self -> {'data_id'}.", $individual_id )";
# NOTE(review): no execute call is visible between this prepare and the
# finish below - confirm the statement is actually executed.
344 $sth = $dbh -> prepare
( "INSERT INTO ".$PsN::config
-> {'_'} -> {'project'}.
345 ".data_individual $columns VALUES $values" );
347 $sth -> finish
if ( defined $sth );
350 end register_di_relation
351 # }}} register_di_relation
357 $full_name = $self -> {'directory'} . $self -> {'filename'};
367 # The bootstrap method draws I<samples> number of bootstrap
368 # samples from the data set. The I<subjects> argument
369 # determines the size of each sample (by default equal to the
370 # number of individuals in the original data set). The method
371 # returns references to three arrays: I<boot_samples_ref>,
372 # which holds the bootstrap data sets, I<incl_individuals_ref>
373 # which holds arrays containing the subject identifiers (ID's)
374 # for the included individuals of each bootstrap data set and
375 # I<included_keys_ref> which holds the key or index of the
376 # included individuals. The key or index is an integer
377 # starting at 1 for the first individual in the original data
378 # set and increasing by one for each following.
379 $self -> synchronize
;
380 my @header = @
{$self -> {'header'}};
381 my $individuals = $self -> {'individuals'};
384 my $status_bar = status_bar
-> new
( steps
=> $samples );
385 ui
-> print( category
=> 'bootstrap',
386 message
=> $status_bar -> print_step
,
389 for ( my $i = 1; $i <= $samples; $i++ ) {
390 my $new_name = defined $name_stub ?
$name_stub."_$i.dta" : "bs$i.dta";
391 $new_name = $directory.'/'.$new_name;
392 my ( $boot, $incl_ind_ref, $incl_key_ref ) =
393 $self -> resample
( subjects
=> \
%subjects,
395 new_name
=> $new_name,
397 stratify_on
=> $stratify_on,
398 model_id
=> $model_ids[$i-1] );
399 push( @included_keys, $incl_key_ref );
400 push( @incl_individuals, $incl_ind_ref );
401 # $boot -> renumber_ascending;
402 push( @boot_samples, $boot );
403 # $boot -> synchronize;
405 if( $status_bar -> tick
() ){
406 ui
-> print( category
=> 'bootstrap',
407 message
=> $status_bar -> print_step
,
411 # print Dumper \@boot_samples;
414 ui
-> print( category
=> 'bootstrap',
415 message
=> ' ... done' );
425 $self -> synchronize
;
426 my ( @header, $individuals, @bs_inds, $key_ref, @id_ids, @bs_id_ids );
427 @id_ids = @
{$self -> {'individual_ids'}} if( defined $self -> {'individual_ids'} );
428 my @subj_keys = keys( %subjects );
429 if ( $#subj_keys < 0 ) {
430 debug
-> die( message
=> "sample_size must be defined" );
432 if ( defined $stratify_on ) {
434 if( $stratify_on =~ /\D/ ){
435 %strata = %{$self -> factors
( column_head
=> $stratify_on )};
436 if ( $strata{'Non-unique values found'} eq '1' ) {
437 debug
-> die( message
=> "Individuals were found to have multiple values in the $stratify_on column. ".
438 "The column $stratify_on cannot be used for stratification of the resampling." );
441 %strata = %{$self -> factors
( column
=> $stratify_on )};
442 if ( $strata{'Non-unique values found'} eq '1' ) {
443 debug
-> die( message
=> "Individuals were found to have multiple values in column number $stratify_on. ".
444 "Column $stratify_on cannot be used for stratification of the resampling." );
447 if ( scalar keys( %subjects) != scalar keys( %strata ) and
448 not ( $#subj_keys == 0 and defined $subjects{'default'} ) ) {
449 debug
-> die( message
=> "sample_size must be defined using one default value ".
450 "or exactly one value per strata:\n".
451 "resampling per STUD=1001,1002,1003\n".
452 "use -sample_size='1001=>10,1002=>25,1003=>12' or ".
453 "-sample_size='default=>10'");
455 unless ( $resume and -e
$new_name ) {
456 @header = @
{$self -> {'header'}};
457 $individuals = $self -> {'individuals'};
458 while( my ( $factor, $key_list ) = each %strata ) {
460 if ( defined $subjects{$factor} ) {
461 $keys = $subjects{$factor};
462 } elsif( defined $subjects{'default'} ) {
463 $keys = sprintf( "%.0f",($subjects{'default'}*
464 (scalar(@
{$key_list}))/($self -> count_ind
())) );
466 debug
-> die( message
=> "A sample size for strata $factor could not be found ".
467 "and no default sample size was set" );
469 for ( my $i = 0; $i < $keys; $i++ ) {
470 my $list_ref = random_uniform_integer
(1,0,(scalar(@
{$key_list}) - 1));
471 push( @bs_inds, $individuals ->
472 [ $key_list -> [$list_ref] ] -> copy
);
473 push( @included_keys, $key_list -> [$list_ref] );
474 push( @incl_individuals, $individuals ->
475 [ $key_list -> [$list_ref] ] -> idnumber
);
476 push( @bs_id_ids, $id_ids[ $key_list -> [$list_ref] ] );
480 $boot = data
-> new
( header
=> \
@header,
481 idcolumn
=> $self -> {'idcolumn'},
482 ignoresign
=> $self -> {'ignoresign'},
483 individuals
=> \
@bs_inds,
484 filename
=> $new_name,
485 ignore_missing_files
=> 1,
487 $boot -> renumber_ascending
;
490 #$boot -> target( $target );
492 # If we are resuming, we still need to generate the
493 # pseudo-random sequence and initiate a data object
494 while( my ( $factor, $key_list ) = each %strata ) {
496 if ( defined $subjects{$factor} ) {
497 $keys = $subjects{$factor};
498 } elsif( defined $subjects{'default'} ) {
499 $keys = sprintf( "%.0f",($subjects{'default'}*
500 (scalar(@
{$key_list}))/($self -> count_ind
())) );
502 debug
-> die( message
=> "A sample size for strata $factor could not be found ".
503 "and no default sample size was set" );
505 for ( my $i = 0; $i < $keys; $i++ ) {
506 my $list_ref = random_uniform_integer
(1,0,(scalar(@
{$key_list}) - 1));
509 $boot = data
-> new
( idcolumn
=> $self -> {'idcolumn'},
510 ignoresign
=> $self -> {'ignoresign'},
511 filename
=> $new_name,
512 ignore_missing_files
=> 1,
520 if( defined $subjects{'default'} ) {
521 $size = $subjects{'default'};
523 debug
-> die( message
=> "No default sample size was set" );
525 unless ( $resume and -e
$new_name ) {
526 @header = @
{$self -> {'header'}};
527 $individuals = $self -> {'individuals'};
528 for ( my $i = 1; $i <= $size; $i++ ) {
529 $key_ref = random_uniform_integer
(1,0,scalar @
{$individuals}-1);
530 push( @bs_inds, $individuals -> [ $key_ref ] -> copy
);
531 push( @included_keys, $key_ref );
532 push( @incl_individuals, $individuals -> [ $key_ref ] -> idnumber
);
533 push( @bs_id_ids, $id_ids[ $key_ref ] );
536 # MUST FIX: If a file already exists with the same name,
537 # the created bs data set will be appended to this. IT
538 # MUST BE OVERWRITTEN!
539 $boot = data
-> new
( header
=> \
@header,
540 idcolumn
=> $self -> {'idcolumn'},
541 ignoresign
=> $self -> {'ignoresign'},
542 individuals
=> \
@bs_inds,
543 filename
=> $new_name,
544 ignore_missing_files
=> 1,
546 $boot -> renumber_ascending
;
548 $boot -> target
( $target );
550 # If we are resuming, we still need to generate the
551 # pseudo-random sequence and initiate a data object
552 for ( my $i = 1; $i <= $size; $i++ ) {
553 random_uniform_integer
(1,0,scalar @
{$individuals}-1)
555 $boot = data
-> new
( idcolumn
=> $self -> {'idcolumn'},
556 ignoresign
=> $self -> {'ignoresign'},
557 filename
=> $new_name,
558 ignore_missing_files
=> 1,
564 if( $target eq 'disk'){
568 $boot -> register_in_database
( individual_ids
=> \
@bs_id_ids,
570 model_id
=> $model_id );
580 # case_deletion creates subsets of the data. The number of
581 # subsets is specified by the bins argument. The individuals
582 # of each subset are selected randomly or in ascending
583 # numerical order depending on the selection argument, which can
584 # be either 'consecutive' or 'random'. case_column must be
585 # specified to give the method something to base the selection
586 # on. Valid case_column values are either the column number
587 # (pure digits) or the name of the column in the (optional)
589 $self -> synchronize
;
590 my @header = @
{$self -> {'header'}};
591 if ( not defined $case_column ) {
592 debug
-> die( message
=> "case_column must be specified" );
594 if ( not $case_column =~ /^\d/ ) {
595 for ( my $i = 0; $i <= $#header; $i++ ) {
596 $case_column = $i+1 if ( $header[$i] eq $case_column );
600 $bins = defined $bins ?
$bins :
601 scalar keys %{$self -> factors
( column
=> $case_column)};
602 my %factors = %{$self -> factors
( column
=> $case_column )};
603 if ( $factors{'Non-unique values found'} eq '1' ) {
604 debug
-> die( message
=> "Individuals were found to have multiple values in column number $case_column. ".
605 "Column $case_column cannot be used for case deletion." );
608 my $maxbins = scalar keys %factors;
609 my @ftrs = sort { $a <=> $b } keys %factors;
610 my $individuals = $self -> {'individuals'};
611 my $maxkey = scalar @
{$individuals} - 1;
613 my ( @tmp_ftrs, @binsize ) =
615 my ( $k, $j, $i ) = ( 0, 0, 0 );
616 # Create the binsizes
617 for ( $j = 0; $j < $maxbins; $j++ ) {
619 $k = 0 if( $k >= $bins );
621 $self -> _fisher_yates_shuffle
( array
=> \
@ftrs ) if( $selection eq 'random' );
622 for ( $k = 0; $k < $bins; $k++ ) {
623 for ( $j = 0; $j < $binsize[ $k ]; $j++ ) {
624 # print "SK: ",$skipped_keys[ $k ]," F: ",$factors{ $ftrs[ $i ] },"\n";
625 push( @
{$skipped_keys[ $k ]}, @
{$factors{ $ftrs[ $i ] }} );
626 push( @
{$skipped_values[ $k ]}, $ftrs[ $i++ ] );
630 for ( $k = 0; $k < $bins; $k++ ) {
633 SELKEYS
: foreach my $key ( 0..$maxkey ) {
634 foreach my $skipped ( @
{$skipped_keys[ $k ]} ) {
635 if ( $key == $skipped ) {
636 push( @
{$skipped_ids[ $k ]}, $individuals ->
637 [ $skipped ] -> idnumber
);
638 push( @del_inds, $individuals -> [ $key ] -> copy
);
642 push( @cd_inds, $individuals -> [ $key ] -> copy
);
644 # Set ignore_missing_files = 1 to make it possible to get the result
646 my $newdata = data
->
647 new
( header
=> \
@header,
648 ignoresign
=> $self -> {'ignoresign'},
649 idcolumn
=> $self -> {'idcolumn'},
650 individuals
=> \
@cd_inds,
652 filename
=> $directory.'/cdd_'.($k+1).'.dta',
653 ignore_missing_files
=> 1 );
654 my $deldata = data
->
655 new
( header
=> \
@header,
656 ignoresign
=> $self -> {'ignoresign'},
657 idcolumn
=> $self -> {'idcolumn'},
658 individuals
=> \
@del_inds,
660 filename
=> $directory.'/rem_'.($k+1).'.dta',
661 ignore_missing_files
=> 1 );
662 push( @subsets, $newdata );
663 push( @remainders, $deldata );
677 # filename: new data file name.
679 # target: keep the copy in memory ('mem') or write it to disk and flush the memory ('disk').
681 ($directory, $filename) = OSspecific
::absolute_path
( $directory, $filename );
683 # Clone self into new data object. Why don't the individuals get cloned too?
684 # strange. need to set synced to 0 AND set the {'individuals'} to undef.
685 cp
($self -> full_name
, $directory.$filename );
686 $new_data = Storable
::dclone
( $self );
687 $new_data -> {'synced'} = 0;
688 $new_data -> {'individuals'} = undef;
689 $new_data -> synchronize
;
691 # Set the new file name for the copy
692 $new_data -> directory
( $directory );
693 $new_data -> filename
( $filename );
699 # {{{ column_to_array
700 start column_to_array
702 $self -> synchronize
;
704 if ( not $column =~ /^\d/ ) {
705 $column = $self -> {'column_head_indices'} -> {$column} - 1;
708 if( $column < 0 or $column > $#{$self -> {'header'}} ){
712 foreach my $individual ( @
{$self -> individuals
} ){
713 foreach my $individual_row( @
{$individual -> subject_data
} ){
714 my @row = split(/,/ , $individual_row);
715 push( @array, $row[$column] );
726 # Returns the number of individuals in the data set.
727 $self -> synchronize
;
728 if( defined $self -> individuals
() ) {
729 $num = scalar @
{$self -> individuals
()};
731 debug
-> die( message
=> "No individuals found in file ".
732 $self -> filename
() );
742 $self -> synchronize
;
744 my $first_id = $self -> {'individuals'}[0];
746 debug
-> die( message
=> "No individuals defined in data object based on ".
747 $self -> full_name
) unless ( defined $first_id );
749 # Check if $column(-index) is defined and valid, else try to find index
752 my @data_row = split( /,/, $first_id -> subject_data
-> [0] );
753 if( $#columns >= 0 ) {
754 foreach my $column ( @columns ) {
755 unless ( defined $column && defined( $data_row[$column-1] ) ) {
756 debug
-> die( message
=> "Error in data -> factors: ".
757 "invalid column number: \"$column\"\n".
758 "Valid column numbers are 1 to ".
759 scalar @
{$first_id -> subject_data
->[0]}."\n" );
762 } elsif ( $#column_heads >= 0 ) {
763 foreach my $column_head ( @column_heads ) {
764 unless (defined($column_head) && defined($self -> {'column_head_indices'}{$column_head})) {
765 debug
-> die( message
=> "Error in data -> factors: unknown column: \"$column_head\" ".
766 "Valid column headers are (in no particular order):\n".
767 join(', ',keys(%{$self -> {'column_head_indices'}})) );
769 my $column = $self -> {'column_head_indices'}{$column_head};
770 push( @columns, $column );
771 debug
-> warn( level
=> 2,
772 message
=> "$column_head is in column number $column" );
776 debug
-> die( message
=> "No column or column_head defined" );
779 if( $global_largest or $global_smallest or
780 $largest_per_individual or $smallest_per_individual ) {
781 if( not scalar @
{$self -> {'individuals'}} == scalar @
{$against_data -> individuals
} ) {
782 debug
-> die( message
=> "Both data object must hold the same number of individuals ".
783 "and observations when calling data -> diff" );
785 for( my $i = 0; $i < scalar @
{$self -> {'individuals'}}; $i++ ) {
786 my %id_diffs = %{$self -> {'individuals'}[$i] ->
787 diff
( against_individual
=> $against_data -> individuals
-> [$i],
788 columns
=> \
@columns,
789 absolute_diff
=> $absolute_diff,
790 diff_as_fraction
=> $diff_as_fraction,
791 largest
=> ( $global_largest or $largest_per_individual ),
792 smallest
=> ( $global_smallest or $smallest_per_individual ) )};
793 if( $global_largest ) {
794 for( my $j = 0; $j <= $#columns; $j++ ) {
795 my $label = defined $column_heads[$j] ?
$column_heads[$j] : $columns[$j];
796 if( not defined $diff_results{$label} or not defined $diff_results{$label}{'diff'} or
797 $id_diffs{$columns[$j]}{'diff'} > $diff_results{$label}{'diff'} ) {
798 $diff_results{$label}{'diff'} = $id_diffs{$columns[$j]}{'diff'};
799 $diff_results{$label}{'self'} = $id_diffs{$columns[$j]}{'self'};
800 $diff_results{$label}{'test'} = $id_diffs{$columns[$j]}{'test'};
806 die "data -> diff is only implemented for finding the largest difference at any observation at this point\n";
815 if ( defined $parm and $parm ne $self -> {'filename'} ) {
816 $self -> {'filename'} = $parm;
817 $self -> {'data_id'} = undef;
828 my %factors = $self -> factors
( 'return_occurences' => 1,
829 'unique_in_individual' => $unique_in_individual,
830 'column_head' => $column_head,
831 'column' => $column);
834 while (my ($factor, $amount) = each %factors) {
835 if ( $factor == $self -> {'missing_data'} && $ignore_missing ) {
841 while (my ($factor, $amount) = each %factors) {
842 if ( $factor == $self -> {'missing_data'} && $ignore_missing ) {
845 $fractions{$factor} = $amount/$sum;
857 # Either column (number, starting at 1) or column_head must be specified.
859 # The default behaviour is to return a hash with the factors as keys
860 # and as values references to arrays with the order numbers (not the ID numbers)
861 # of the individuals that contain this factor
863 # If unique_in_individual is true (1), the returned hash will contain
864 # an element with key 'Non-unique values found' and value 1 if any
865 # individual contains more than one value in the specified column.
867 # If return_occurences is true, the occurrences of each factor
868 # value are counted. Several occurrences in one individual count as
869 # one occurrence. The elements of the returned hash will have the factors
870 # as keys and the number of occurrences as values.
873 $self -> synchronize
;
875 # Check if $column(-index) is defined and valid, else try to find index
877 my $first_id = $self -> {'individuals'}[0];
879 debug
-> die( message
=> "No individuals defined in data object based on ".
880 $self -> full_name
) unless ( defined $first_id );
882 my @data_row = split( /,/, $first_id -> subject_data
-> [0] );
883 unless ( defined $column && defined( $data_row[$column-1] ) ) {
884 unless (defined($column_head) && defined($self -> {'column_head_indices'}{$column_head})) {
885 debug
-> die( message
=> "Error in data -> factors: unknown column: \"$column_head\" ".
886 "or invalid column number: \"$column\".\n".
887 "Valid column numbers are 1 to ".scalar @data_row ."\n".
888 "Valid column headers are (in no particular order):\n".
889 join(', ',keys(%{$self -> {'column_head_indices'}})) );
891 $column = $self -> {'column_head_indices'}{$column_head};
892 debug
-> warn( level
=> 2,
893 message
=> "$column_head is in column number $column" );
898 foreach my $individual ( @
{$self -> {'individuals'}} ) {
899 my @ifactors = keys %{$individual -> factors
( column
=> $column )};
900 if ( scalar @ifactors > 1 and $unique_in_individual ) {
901 %factors = ( 'Non-unique values found' => 1 );
904 debug
-> die( message
=> "No value found in column $column in individual ".
905 $individual -> idnumber
) if ( scalar @ifactors == 0 );
907 # return_occurences will calculate the occurrence of each
908 # factor value. Several occurrences in one individual count as
911 if ( $return_occurences ) {
912 foreach my $ifactor ( @ifactors ) {
913 $factors{$ifactor}++;
916 foreach my $ifactor ( @ifactors ) {
917 push( @
{$factors{$ifactor}}, $key );
927 # {{{ find_individual
929 # start find_individual
930 # foreach my $tmp_ind ( @{$self -> individuals} ) {
931 # if ( $tmp_ind -> key == $key ) {
932 # $individual = $tmp_ind;
936 # if ( defined $individual ) {
938 # $individual = $individual -> copy;
941 # print "No individual with key $key found in call to ".
942 # "data -> find_individual\n" if ( $self -> debug );
944 # end find_individual
952 my $header = $self -> {'header'};
954 # format the data for NONMEM (simple comma-separated layout)
955 if ( defined $self -> {'comment'} ) {
956 my @comment = @
{$self -> {'comment'}};
962 my $wrap = ( defined $self -> {'wrap_column'} and
963 defined $self -> {'cont_column'} );
965 my @primary_columns = defined $self -> {'primary_columns'} ?
966 @
{$self -> {'primary_columns'}} : ();
967 my @secondary_columns = defined $self -> {'secondary_columns'} ?
968 @
{$self -> {'secondary_columns'}} : ();
969 if ( defined $header and defined $self -> {'ignoresign'} ) {
971 if ( $self -> {'ignoresign'} ne '@' ) {
972 $istr = $self -> {'ignoresign'};
976 for ( my $i = 0; $i <= $#secondary_columns ; $i++ ) {
978 for ( my $j = 0; $j < scalar @
{$secondary_columns[$i]} ; $j++ ) {
979 my $jstr = $j == 0 ?
'' : ',';
980 $sstr = $sstr.$jstr.$secondary_columns[$i][$j][0];
982 push( @h_data, $sstr."\n" );
984 push( @form_data, @h_data );
986 for ( my $i = 0; $i <= $#primary_columns ; $i++ ) {
987 my $jstr = $i == 0 ?
'' : ',';
988 $pstr = $pstr.$jstr.$primary_columns[$i][0];
990 push( @form_data, $pstr."\n" );
992 push( @form_data, $istr.join(',',@
{$self -> {'header'}})."\n" );
996 foreach my $individual ( @
{$self -> {'individuals'}} ) {
997 foreach my $row ( @
{$individual -> subject_data
} ) {
999 for ( my $i = 0; $i <= $#secondary_columns ; $i++ ) {
1001 for ( my $j = 0; $j < scalar @
{$secondary_columns[$i]} ; $j++ ) {
1002 my $jstr = $j == 0 ?
'' : ',';
1003 if ( $secondary_columns[$i][$j][0] eq 'CONT' ) {
1004 $sstr = $sstr.$jstr.'1';
1006 my @data_row = split( /,/, $row );
1007 $sstr = $sstr.$jstr.$data_row[$secondary_columns[$i][$j][1]];
1010 push( @r_data, $sstr."\n" );
1012 push( @form_data, @r_data );
1014 for ( my $i = 0; $i <= $#primary_columns ; $i++ ) {
1015 my $jstr = $i == 0 ?
'' : ',';
1016 if ( $primary_columns[$i][0] eq 'CONT' ) {
1017 $pstr = $pstr.$jstr.'0';
1019 my @data_row = split( /,/, $row );
1020 $pstr = $pstr.$jstr.$data_row[$primary_columns[$i][1]];
1023 push( @form_data, $pstr."\n" );
1027 foreach my $individual ( @
{$self -> {'individuals'}} ) {
1028 foreach my $row ( @
{$individual -> subject_data
} ) {
1029 push( @form_data, $row ."\n" );
1042 # This method removes columns that have the '=DROP' value in the
1043 # model header as given by $INPUT. The model header must be
1044 # transferred to this method through the model_header
1045 # argument. The model_header argument should be a
1046 # two-dimensional array where each position in the first
1047 # dimension should be a reference to a 1*2 array holding the
1048 # column name and value. Any ignore-sign must be removed.
1050 debug
-> die( message
=> 'model header must be defined' )
1051 if ( $#model_header < 0 );
1052 # Important that the drop_dropped method of the model::problem
1053 # class is in sync with this method.
1054 $self -> synchronize
;
1056 $self -> {'header'} = [];
1059 for( my $i = 0; $i <= $#model_header; $i++ ) {
1060 $self -> {'idcolumn'} = $counter if ( $model_header[$i][0] eq 'ID' );
1061 if( ( $model_header[$i][1] eq 'DROP' or
1062 $model_header[$i][1] eq 'SKIP' ) and
1063 not $model_header[$i][0] =~ /DAT(E|1|2|3)/ ) {
1068 push( @
{$self -> {'header'}}, $model_header[$i][0] );
1072 foreach my $individual ( @
{$self -> {'individuals'}} ) {
1073 $individual -> drop_columns
( drop
=> \
@drop );
1076 $self -> {'synced'} = 0;
1077 # $Data::Dumper::Maxdepth = 2;
1079 # die Dumper $self -> {'individuals'};
1088 $self -> synchronize
;
1089 $self -> cont_column
( $cont_column ) if ( defined $cont_column );
1090 $self -> wrap_column
( $wrap_column ) if ( defined $wrap_column );
1091 $self -> prepare_wrap
( model_header
=> \
@model_header );
1092 @secondary_columns = @
{$self -> {'secondary_columns'}}
1093 if ( defined $self -> {'secondary_columns'} );
1094 @primary_columns = @
{$self -> {'primary_columns'}}
1095 if ( defined $self -> {'primary_columns'} );
1104 $self -> {'cont_column'} = undef;
1105 $self -> {'wrap_column'} = undef;
1106 $self -> {'secondary_columns'} = undef;
1107 $self -> {'primary_columns'} = undef;
1116 my $cont_column = $self -> {'cont_column'};
1117 my $wrap_column = $self -> {'wrap_column'};
1118 debug
-> die( message
=> 'cont_column ('.$cont_column.') must be less or equal '.
1119 'to the requested number of columns in each row ('.
1120 ($wrap_column).')' )
1121 if ( $cont_column > $wrap_column );
1123 if ( scalar @model_header > 0 ) {
1124 @header = @model_header;
1126 @header = @
{$self -> {'header'}};
1129 my ( @primary, @secondary, @date_columns );
1131 for ( my $i = 0; $i <= $#header; $i++ ) {
1132 my $name = ref( $header[$i] ) eq 'ARRAY' ?
$header[$i][0] : $header[$i];
1133 my $value = ref( $header[$i] ) eq 'ARRAY' ?
$header[$i][1] : undef;
1134 next if ( $name eq 'ID' );
1136 foreach my $prim ( @primary_column_names ) {
1138 ( $name eq $prim or $value eq $prim ) ) {
1139 push( @primary, [$name, $i, $value] );
1141 my $col = ($#primary+2)>= $cont_column ?
($#primary+3) : ($#primary+2);
1142 push( @date_columns, $col ) if ( $name =~ /DAT(E|1|2|3)/ );
1145 push( @secondary, [$name, $i, $value] ) if ( not $found );
1148 my $prim_num = scalar @primary;
1149 debug
-> die( message
=> 'The number of primary columns (that need to '.
1150 'be part of the row with CONT=0) ('.($prim_num+1).
1151 ') is larger than the required number of columns (wrap_column='.
1152 $wrap_column.') - 1' )
1153 if ( scalar $prim_num > ($wrap_column-2) );
1155 my ( $i, $dum ) = ( 0, 1 );
1157 for ( my $j = 1; $j <= $wrap_column; $j++ ) {
1159 push( @tmp, ['ID', $self -> {'idcolumn'}-1] );
1160 } elsif ( $j == $wrap_column ) {
1161 if ( $j == $cont_column ) {
1162 push( @tmp, ['CONT', undef] );
1165 if ( defined $primary[$i] ) {
1166 $val = $primary[$i];
1167 } elsif ( defined $secondary[0] ) {
1168 $val = shift(@secondary);
1170 $val = ['XX'.$dum++,$self -> {'idcolumn'}-1];
1175 push( @
{$self -> {'primary_columns'}}, @tmp );
1177 if ( $j == $cont_column ) {
1178 push( @tmp, ['CONT', undef] );
1180 if ( $i <= $#primary ) {
1181 push( @tmp, $primary[$i] );
1184 my $val = defined $secondary[0] ?
shift(@secondary) :
1185 ['XX'.$dum++,$self -> {'idcolumn'}-1];
1193 while ( $i <= $#secondary ) {
1195 for ( my $j = 1; $j <= $wrap_column; $j++ ) {
1197 push( @tmp, ['ID', $self -> {'idcolumn'}-1] );
1198 } elsif ( $j == $wrap_column ) {
1199 if ( $j == $cont_column ) {
1200 push( @tmp, ['CONT', undef] );
1202 my $val = defined $secondary[$i] ?
$secondary[$i] :
1203 ['XX'.$dum++,$self -> {'idcolumn'}-1];
1207 unshift( @
{$self -> {'secondary_columns'}}, \
@tmp );
1209 if ( $j == $cont_column ) {
1210 push( @tmp, ['CONT', undef] );
1213 if ( $#date_columns >= 0 ) {
1214 foreach my $col ( @date_columns ) {
1215 # This is a date column which may have to be dropped
1216 # and thus will not appear as a secondary
1217 # column. Nothing should be pushed. The indexes in
1218 # model::problem::pk::_format_record will be ok.
1219 $isdate = 1 if ( $col == $j ) ;
1223 push( @tmp, ['XX'.$dum++,$self -> {'idcolumn'}-1] );
1225 if ( $i <= $#secondary ) {
1226 push( @tmp, $secondary[$i] );
1229 push( @tmp, ['XX'.$dum++,$self -> {'idcolumn'}-1] );
1241 # {{{ have_missing_data
start have_missing_data
    {
        # Either I<column> or I<column_head> must be specified.
        #
        # Reports whether the data column with index I<column> (1-based)
        # or, alternatively, header name I<column_head> has been flagged
        # as containing the missing data indicator. Returns 0 if no flags
        # are recorded for the data object; otherwise the flag stored for
        # the column (true if missing data was found).
        #
        # Dies if the data object holds no individuals, or if neither
        # I<column> nor I<column_head> identifies a column.

        $self -> synchronize;

        my $first_id = $self -> {'individuals'}[0];
        debug -> die( message => "No individuals defined in data object based on ".
                      $self -> full_name ) unless ( defined $first_id );

        # Use the first row of the first individual to validate the
        # column index; fall back on the header name if it is unusable.
        my @data_row = split( /,/ , $first_id -> subject_data -> [0] );
        unless ( defined $column && defined( $data_row[$column-1] ) ) {
            if ( defined($column_head) && defined($self -> {'column_head_indices'}{$column_head}) ) {
                $column = $self -> {'column_head_indices'}{$column_head};
            } else {
                die "Error in data -> have_missing_data: unknown column: \"$column_head\" or invalid column number: \"$column\"\n";
            }
        }

        # Read the flag BEFORE flushing: flush resets the
        # have_missing_data attribute, so reading it afterwards would
        # always yield 0 for data objects with target 'disk'.
        $return_value = defined $self -> {'have_missing_data'} ?
            $self -> {'have_missing_data'} -> {$column} : 0;

        $self -> flush if ( $self -> {'target'} eq 'disk' );
    }
end have_missing_data
1271 # }}} have_missing_data
1276 #$self -> synchronize;
1277 push( @
{$self -> {'individuals'}}, @
{$mergeobj -> individuals
} );
1286 # Either column or column_head must be specified. Column_head must be a string that
1287 # identifies a column in the (optional ) data file header.
1289 # The if-statement below used to be a cache of already calculated
1290 # maxima. But since individuals can be accessed in so many ways, we
1291 # don't know when this cache should be updated. It's easier to
1292 # recalculate the max. Maybe we can include this optimization in the
1293 # future, if it turns out to be a bottleneck.
1294 # my $tmp_column = $self -> {'column_head_indices'}{$column_head};
1295 # if ( defined $self -> {'max'}[$tmp_column] ) {
1296 # $return_value = $self -> {'max'}[$tmp_column] ;
1298 $self -> synchronize
;
1299 my $first_id = $self -> {'individuals'}[0];
1300 debug
-> die( message
=> "data -> max: No individuals defined in data object based on " .
1301 $self -> full_name
) unless defined $first_id;
1303 my @data_row = split( /,/ , $first_id -> subject_data
->[0] );
1305 unless ( defined $column && defined( $data_row[$column-1] ) ) {
1306 unless (defined($column_head) && defined($self -> {'column_head_indices'}{$column_head})) {
1307 die "Error in data -> max: unknown column: \"$column_head\" or invalid column number: \"$column\"\n";
1309 $column = $self -> {'column_head_indices'}{$column_head};
1312 foreach my $individual ( @
{$self -> {'individuals'}} ) {
1313 my $ifactors = $individual -> factors
( 'column' => $column );
1314 foreach ( keys %{$ifactors} ) {
1315 next if ( $_ == $self -> {'missing_data_token'} );
1316 if ( defined ($return_value) ) {
1317 $return_value = $_ > $return_value ?
$_ : $return_value;
1324 # $self -> {'max'}[$column] = $return_value;
1325 $self -> flush
if ( $self -> {'target'} eq 'disk' );
1337 my $tmp_column = $self -> {'column_head_indices'}{$column_head};
1339 # The if-statement below used to be a cache of already calculated
1340 # minima. But since individuals can be accessed in so many ways, we
1341 # don't know when this cache should be updated. It's easier to
1342 # recalculate the min. Maybe we can include this optimization in the
1343 # future, if it turns out to be a bottleneck.
1344 # if ( defined $self -> {'min'}[$tmp_column] ) {
1345 # $return_value = $self -> {'min'}[$tmp_column] ;
1347 $self -> synchronize
;
1348 my $first_id = $self -> {'individuals'}[0];
1349 die "data -> min: No individuals defined in data object based on ",
1350 $self -> full_name
,"\n" unless defined $first_id;
1352 my @data_row = split( /,/ , $first_id -> subject_data
->[0] );
1354 unless ( defined $column && defined( $data_row[$column-1] ) ) {
1355 unless (defined($column_head) && defined($self -> {'column_head_indices'}{$column_head})) {
1356 die "Error in data -> min: unknown column: \"$column_head\" or invalid column number: \"$column\"\n";
1358 $column = $self -> {'column_head_indices'}{$column_head};
1361 foreach my $individual ( @
{$self -> {'individuals'}} ) {
1362 my $ifactors = $individual -> factors
( 'column' => $column );
1363 foreach ( keys %{$ifactors} ) {
1364 next if ( $_ == $self -> {'missing_data_token'} );
1365 if ( defined ($return_value) ) {
1366 $return_value = $_ < $return_value ?
$_ : $return_value;
1372 # $self -> {'min'}[$column] = $return_value;
1373 $self -> flush
if ( $self -> {'target'} eq 'disk' );
1385 $self -> synchronize
;
1386 my $first_id = $self -> {'individuals'}[0];
1387 die "data -> median: No individuals defined in data object based on ",
1388 $self -> full_name
,"\n" unless defined $first_id;
1390 my @data_row = split( /,/ , $first_id -> subject_data
->[0] );
1392 unless ( defined $column && defined( $data_row[$column-1] ) ) {
1393 unless(defined($column_head) && defined($self -> {'column_head_indices'}{$column_head})){
1394 die "Error in data -> median: unknown column: \"$column_head\" or invalid column number: \"$column\"\n";
1396 $column = $self -> {'column_head_indices'}{$column_head};
1400 if( defined $self -> {'median'}[$column] ){
1401 return $self -> {'median'}[$column];
1406 foreach my $individual ( @
{$self -> {'individuals'}} ) {
1407 if( $unique_in_individual ){
1408 my $ifactors = $individual -> factors
( 'column' => $column );
1410 foreach ( keys %{$ifactors} ) {
1411 next if ( $_ == $self -> {'missing_data_token'} );
1412 push( @median_array, $_ );
1415 my $ifactors = $individual -> subject_data
;
1417 for(my $i=0; $i<=$#{$ifactors}; $i++ ) {
1418 my @data_row = split( /,/ , $ifactors -> [$i] );
1419 next if ( $data_row[$column-1] == $self -> {'missing_data_token'} );
1420 push(@median_array, $data_row[$column-1]);
1424 @median_array = sort {$a <=> $b} @median_array ;
1426 if( @median_array % 2 ){
1427 $return_value = $median_array[$#median_array / 2];
1429 $return_value = ( $median_array[@median_array / 2] +
1430 $median_array[(@median_array - 2) / 2] ) / 2;
1433 $self -> {'median'}[$column] = $return_value;
1443 # Returns the mean value of a column.
1444 # If an individual contains more than one value (i.e. if an
1445 # individual has different values in different samples), the mean
1446 # value within each individual is calculated first, and then the mean
1447 # value of the column. If hi_cutoff is defined the mean function
1448 # will cut all values below the cutoff, and set their value to
1449 # 0. It's used to calculate the HI-mean/LOW-mean of a column for
1450 # e.g. hockey-stick covariates. If both hi_cutoff and low_cutoff
1451 # are defined only the hi_cutoff will be used. See L</max>.
1452 my $tmp_column = $self -> {'column_head_indices'}{$column_head};
1453 $self -> synchronize
;
1454 my $first_id = $self -> {'individuals'}[0];
1455 die "data -> mean: No individuals defined in data object based on ",
1456 $self -> full_name
,"\n" unless defined $first_id;
1458 my @data_row = split( /,/, $first_id -> subject_data
->[0] );
1460 unless ( defined $column && defined( $data_row[$column-1] ) ) {
1461 unless (defined($column_head) && defined($self -> {'column_head_indices'}{$column_head})) {
1462 die "Error in data -> mean: unknown column: \"$column_head\" or invalid column number: \"$column\"\n";
1464 $column = $self -> {'column_head_indices'}{$column_head};
1468 ## Here the calculation starts
1469 my $num_individuals = 0;
1472 my $all_data_rows=0;
1473 foreach my $individual ( @
{$self ->{'individuals'}} ) {
1475 my $ifactors = $individual -> subject_data
;
1476 my $individual_sum = 0;
1478 for(my $i=0; $i<=$#{$ifactors}; $i++ ) {
1480 # data is stored in strings. We need to split them into an
1483 my @data_row = split( /,/, $ifactors -> [$i] );
1484 if ( $data_row[$column-1] == $self -> {'missing_data_token'} ) {
1485 # print "Skipping row with missing data\n";
1489 if( defined $subset_column and not eval ( $data_row[$subset_column-1].$subset_syntax ) ) {
1490 # print "Skipping row outside subset: syntax: ".($subset_column-1)." $subset_syntax\n";
1494 if (defined $hi_cutoff) {
1495 if ($data_row[$column-1]>$hi_cutoff) {
1496 $individual_sum += $data_row[$column-1]-$hi_cutoff;
1500 if (defined $low_cutoff) {
1501 if ($data_row[$column-1]<$low_cutoff) {
1502 $individual_sum += $low_cutoff - $data_row[$column-1];
1506 $individual_sum += $data_row[$column-1];
1511 if( $global_mean ) {
1512 $sum += $individual_sum;
1513 $num_individuals += $data_rows;
1515 if( $data_rows != 0 ) {
1516 $sum += $individual_sum/$data_rows;
1518 $num_individuals ++;
1520 $all_data_rows += $data_rows;
1522 if( $num_individuals != 0 ) {
1523 $return_value = $sum / $num_individuals;
1525 # print "DR: $all_data_rows\n";
1526 # print "\nNIM: $num_individuals $return_value\n";
1537 # This sub returns standard deviation for a specific column
1538 # If there are more than one sample/individual the value used for that specific
1539 # individual is the mean value of its samples.
1540 # The cut-offs are for hockey stick variables. I.e. If one individual value is
1541 # lower than the hi-cutoff the individual value will be zero.
1542 # HI_cutoff is used to calculate the HI-mean of a column.
1543 # If cut_off is undef it won't be used
1545 my $tmp_column = $self -> {'column_head_indices'}{$column_head};
1546 $self -> synchronize
;
1547 my $first_id = $self -> {'individuals'}[0];
1548 debug
-> die( message
=> "No individuals defined in data object based on ".
1549 $self -> full_name
) unless defined $first_id;
1551 my @data_row = split( /,/ , $first_id -> subject_data
->[0] );
1553 unless ( defined $column && defined( $data_row[$column-1] ) ) {
1554 unless (defined($column_head) && defined($self -> {'column_head_indices'}{$column_head})) {
1555 debug
-> die( message
=> "Unknown column: \"$column_head\" or "
1556 ."invalid column number: \"$column\"" );
1558 $column = $self -> {'column_head_indices'}{$column_head};
1562 ## Here the calculation starts
1563 my $num_individuals = 0;
1566 if (defined $hi_cutoff) {
1567 $mean = $self->mean(column
=> $column,
1568 hi_cutoff
=> $hi_cutoff,
1569 global_mean
=> $global_sd );
1570 } elsif (defined $low_cutoff) {
1571 $mean = $self->mean(column
=> $column,
1572 low_cutoff
=> $low_cutoff,
1573 global_mean
=> $global_sd );
1575 $mean = $self->mean( column
=> $column,
1576 subset_column
=> $subset_column,
1577 subset_syntax
=> $subset_syntax,
1578 global_mean
=> $global_sd );
1581 foreach my $individual ( @
{$self -> {'individuals'}} ) {
1582 my $ifactors = $individual -> subject_data
;
1583 my $individual_sum = 0;
1585 for(my $i=0; $i<=$#{$ifactors}; $i++ ) {
1587 # data is stored in strings. We need to split them into an
1590 my @data_row = split( /,/, $ifactors -> [$i] );
1592 if ( $data_row[$column-1] == $self -> {'missing_data_token'} ) {
1593 # print "Skipping row with missing data\n";
1597 if( defined $subset_column and not eval ( $data_row[$subset_column-1].$subset_syntax ) ) {
1598 # print "Skipping row outside subset: syntax: ".($subset_column-1)." $subset_syntax\n";
1602 if (defined $hi_cutoff) {
1603 if ($data_row[$column-1]>$hi_cutoff) {
1605 $individual_sum += ($data_row[$column-1] - $hi_cutoff - $mean) ** 2;
1607 $individual_sum += $data_row[$column-1]-$hi_cutoff;
1611 if (defined $low_cutoff) {
1612 if ($data_row[$column-1]<$low_cutoff) {
1614 $individual_sum += ($low_cutoff - $data_row[$column-1] - $mean) ** 2;
1616 $individual_sum += $low_cutoff - $data_row[$column-1];
1621 $individual_sum += ($data_row[$column-1] - $mean) ** 2;
1623 $individual_sum += $data_row[$column-1];
1630 $sum += $individual_sum;
1631 $num_individuals += $data_rows;
1633 if( $data_rows != 0 ) {
1634 $sum += ($individual_sum/$data_rows - $mean) ** 2;
1639 if( $num_individuals < 2 ) {
1642 if( $num_individuals != 0 ) {
1643 $return_value = (1/($num_individuals-1)*$sum) ** 0.5;
# Returns max - min for the column given by I<column> (index) or
# I<column_head> (header name), using a per-column cache.
#
# NOTE(review): the cache is read at the index looked up from
# column_head but written at $column; when only one of the two
# arguments is supplied the slots differ. Left unchanged here -
# confirm the intended key before relying on the cache.
my $tmp_column = $self -> {'column_head_indices'}{$column_head};
if ( defined $self -> {'range'}[$tmp_column] ) {
    $return_value = $self -> {'range'}[$tmp_column];
} else {
    # max and min each flush the object when the target is 'disk'.
    # Switch to 'mem' temporarily so the data is not flushed (and
    # re-read) between the two calls.
    my $old_target = $self -> {'target'};
    $self -> {'target'} = 'mem';
    $self -> synchronize;
    $return_value = $self -> max( column      => $column,
                                  column_head => $column_head ) -
                    $self -> min( column      => $column,
                                  column_head => $column_head );
    $self -> {'range'}[$column] = $return_value;
    if ( $old_target eq 'disk' ) {
        # Restore the target first and then flush. The previous code
        # tested for target 'disk' while it was still 'mem', so the
        # flush could never happen and the data stayed in memory.
        $self -> {'target'} = 'disk';
        $self -> flush;
    }
}
# Recalculates a column based on expression. Also, see L</max>.
#
# The column is identified either by I<column> (1-based index) or by
# I<column_head> (name in the data file header). The recalculation
# itself is delegated to the recalc_column method of each individual.
$self -> synchronize;

# Check if $column(-index) is defined and valid, else try to find index using column_head
my $first_id = $self -> {'individuals'}[0];
die "data -> recalc_column: No individuals defined in data object based on ",
    $self -> full_name, "\n" unless defined $first_id;

my @data_row = split( /,/ , $first_id -> subject_data ->[0] );

unless ( defined $column && defined( $data_row[$column-1] ) ) {
    # The test used to be inverted here: the method died when
    # column_head WAS a known header and silently set $column to undef
    # when it was not. Resolve the index for a known header and die
    # otherwise, as the sibling methods (max, min, median, ...) do.
    if ( defined($column_head) && defined($self -> {'column_head_indices'}{$column_head}) ) {
        $column = $self -> {'column_head_indices'}{$column_head};
    } else {
        die "Error in data -> recalc_column: unknown column: \"$column_head\" or column number: \"$column\"\n";
    }
}

for my $individual ( @{$self -> {'individuals'}} ) {
    $individual -> recalc_column( column     => $column,
                                  expression => $expression );
}
1706 # {{{ renumber_ascending
start renumber_ascending
    {
        # Gives every individual a new subject identifier: the first
        # individual gets start_at, the next one start_at + 1, and so on
        # in the order the individuals are stored. The point is to make
        # the identifiers unique integers, not to sort the individuals
        # by identifier.

        $self -> synchronize;

        for my $subject ( @{ $self -> {'individuals'} } ) {
            $subject -> idnumber( $start_at );
            $start_at++;
        }

        # The in-memory identifiers now differ from the file on disk.
        $self -> {'synced'} = 0;
    }
end renumber_ascending
1724 # }}} renumber_ascending
1726 # {{{ renumber_descending
start renumber_descending
    {
        # Counterpart of renumber_ascending: the first individual gets
        # start_at, the next one start_at - 1, and so on in the order the
        # individuals are stored. See L</renumber_ascending>.

        $self -> synchronize;

        for my $subject ( @{ $self -> {'individuals'} } ) {
            $subject -> idnumber( $start_at );
            $start_at--;
        }

        # The in-memory identifiers now differ from the file on disk.
        $self -> {'synced'} = 0;
    }
end renumber_descending
1739 # }}} renumber_descending
1741 # {{{ single_valued_data
1743 start single_valued_data
1747 # ($single_value_data_set, $remainder, $column_indexes) =
1748 # $data_object -> single_valued_data( subset_name => 'subset.dta',
1749 # remainder_name => 'remainder.dta',
1751 # do_not_test_columns => [1..18,24,26];
1753 # my $single_value_column_indexes = $column_indexes -> [0];
1754 # my $all_other_column_indexes = $column_indexes -> [1];
1756 # Analyses the content of each column, based on the
1757 # ID column, and returns two new data objects: One
1758 # that contains all columns that have only one value per
1759 # individual and one that contains the
1760 # remaining data. This is useful for creating compact 'extra'
1761 # data sets that can be read in via user-defined sub-routines
1762 # when the number of columns needed exceeds the maximum that
1763 # NONMEM allows (e.g. 20 in NONMEM version V).
1765 # The I<do_not_test_columns> argument specifies on which columns
1766 # to skip the single value test
# One flag per column. A true flag means the column is NOT treated as
# single-valued: either it was listed in do_not_test_columns or some
# individual has more than one distinct value in it.
1768 my @multi_value_flags;
1769 my @individuals = @
{$self -> {'individuals'}};
1770 # Initiate the flags:
# The number of columns is taken from the first row of the first
# individual.
1771 if ( defined $individuals[0] ) {
1772 my @data = @
{$individuals[0] -> {'subject_data'}};
1773 my @data_row = split( /,/ , $data[0] );
1774 for ( my $i = 0; $i < scalar @data_row; $i++ ) {
# do_not_test_columns holds 1-based column numbers while $i is 0-based.
1776 foreach my $dntc ( @do_not_test_columns ) {
1777 $dnt_flag = 1 if ( $i == $dntc - 1 );
1779 $multi_value_flags[$i] = $dnt_flag;
# NOTE(review): presumably the branch taken when the data object has no
# individuals at all; the wording "ID number 1" may be misleading - confirm.
1782 die "data -> single_valued_data: No data in ID number 1\n";
# For every individual and every column, collect the distinct values
# found in that individual's rows; more than one distinct value flags
# the column.
1785 for ( my $id = 0; $id <= $#individuals; $id++ ) {
1786 my @data = @
{$individuals[$id] -> {'subject_data'}};
1787 my @data_row = split( /,/, $data[0] );
1788 for ( my $j = 0; $j < scalar @data_row; $j++ ) {
# NOTE(review): each row is split again for every column $j; the inner
# @data_row shadows the one declared above.
1790 for ( my $i = 0; $i <= $#data; $i++ ) {
1791 my @data_row = split( /,/ , $data[$i] );
1792 $col_unique{$data_row[$j]}++;
1794 my $factors = scalar keys %col_unique;
1795 $multi_value_flags[$j]++ if ( $factors > 1 );
# Translate the flags into 1-based column numbers: $column_indexes[1]
# receives the flagged columns, $column_indexes[0] the single-valued ones.
1798 for ( my $i = 0; $i <= $#multi_value_flags; $i++ ) {
1799 if ( $multi_value_flags[$i] ) {
1800 push ( @
{$column_indexes[1]}, $i + 1);
1802 push ( @
{$column_indexes[0]}, $i + 1);
# Split the data vertically on the single-valued columns. The remainder
# is requested as well, and keep_first_row_only => 1 is passed for the
# single-valued subset.
1805 ( $single_value_data_set, $remainder ) =
1806 $self -> subset_vertically
( column_indexes
=> $column_indexes[0],
1807 subset_name
=> $subset_name,
1808 return_remainder
=> 1,
1809 remainder_name
=> $remainder_name,
1811 keep_first_row_only
=> 1);
1813 end single_valued_data
1817 # {{{ subset_vertically
1819 start subset_vertically
1823 # $subset = $data_object -> subset_vertically ( column_indexes => [1,2,6],
1824 # subset_name => 'subset.dta' );
1826 # This basic usage returns a new data object containing
1827 # columns 1,2 and 6 from the original data plus the
1828 # idcolumn. The new data object will be associated with the
1829 # file 'subset.dta'.
1831 # You get the remaining data, i.e. the original data minus
1832 # the created subset by specifying
1834 # ( $subset, $remainder ) =
1835 # $data_object -> subset_vertically ( column_indexes => [1,2,6],
1836 # subset_name => 'subset.dta',
1837 # return_remainder => 1,
1838 # remainder_name => 'remainder.dta' );
1840 # If you would like to flush the created data sets to disk and
1841 # save memory, set the I<target> argument to 'disk'. The
1842 # default value 'mem' will keep the whole data object in RAM.
1844 # The I<keep_first_row_only> argument can be used to reduce
1845 # the size of the subset data object by excluding all but the
1846 # first row of data from each individual.
1848 my @individuals = @
{$self -> {'individuals'}};
1849 # Create remainder index array if necessary
1850 my @remainder_indexes;
# The column layout and the ID column are taken from the first individual.
1851 if ( defined $individuals[0] ) {
1852 my @data = @
{$individuals[0] -> {'subject_data'}};
1853 my $idcolumn = $individuals[0] -> {'idcolumn'};
1854 # print "IC: $idcolumn\n";
# Find out whether the ID column is among the requested (1-based)
# column_indexes.
1856 foreach my $use_index ( @column_indexes ) {
1857 $id_flag = 1 if ( $use_index == $idcolumn );
1859 if ( $return_remainder ) {
1860 # @remainder_indexes = ( $idcolumn );
# The remainder consists of every column that was not requested.
# NOTE(review): split is evaluated in scalar context, and again on
# every pass through the loop condition.
1861 for ( my $i = 0; $i < scalar split(/,/,$data[0]); $i++ ) {
1863 foreach my $use_index ( @column_indexes ) {
1864 $rem_flag = 0 if ( $i == $use_index -1 );
1866 # $i == $idcolumn -1 );
1868 push( @remainder_indexes, $i + 1 ) if ( $rem_flag );
# If the ID column was requested for the subset it is missing from the
# remainder so far; put it first there so both data sets carry it.
1870 unshift( @remainder_indexes, $idcolumn ) if ( $id_flag );
# Prepend the ID column to the subset unless it was already requested.
1872 unshift( @column_indexes, $idcolumn ) unless ( $id_flag );
# NOTE(review): the message names single_valued_data although it is
# raised from subset_vertically - looks like a copy-paste leftover.
1874 die "data -> single_valued_data: No data in ID number 1\n";
# Build, individual by individual, the reduced rows for the subset and,
# when requested, for the remainder.
1879 for ( my $id = 0; $id <= $#individuals; $id++ ) {
1880 my $idnumber = $individuals[$id] -> idnumber
;
1881 my $idcolumn = $individuals[$id] -> idcolumn
;
1882 my @data = @
{$individuals[$id] -> {'subject_data'}};
# keep_first_row_only limits the subset to row 0 of each individual;
# the remainder loop further down still walks every row.
1885 my $use_rows = $keep_first_row_only ?
0 : $#data;
1886 for ( my $i = 0; $i <= $use_rows; $i++ ) {
1888 my @data_row = split( /,/, $data[$i] );
1889 foreach my $use_index ( @column_indexes ) {
1890 push( @new_row, $data_row[$use_index-1] );
1892 # print "@new_row $#new_row\n";
1893 push( @new_data, join( ',', @new_row ) );
1895 for ( my $i = 0; $i <= $#data; $i++ ) {
1896 if ( $return_remainder ) {
1898 my @data_row = split( /,/, $data[$i] );
1899 foreach my $use_index ( @remainder_indexes ) {
1900 push( @new_row_2, $data_row[$use_index-1] );
1902 # print "@new_row_2 $#new_row_2\n";
1903 push( @new_data_2, join( ',' , @new_row_2 ) );
# NOTE(review): the new individuals are created with the idcolumn of the
# original individual although the columns have been re-arranged above -
# confirm that this is the position of the ID in the new rows.
1906 my $new_id = data
::individual
-> new
( idnumber
=> $idnumber,
1907 idcolumn
=> $idcolumn,
1908 subject_data
=> \
@new_data );
1909 push( @new_ids, $new_id );
1910 if ( $return_remainder ) {
1912 $new_id_2 = data
::individual
-> new
( idnumber
=> $idnumber,
1913 idcolumn
=> $idcolumn,
1914 subject_data
=> \
@new_data_2 );
1915 push( @new_ids_2, $new_id_2 );
# Pick the header entries that belong to the subset columns.
1918 my @header = @
{$self -> {'header'}};
1920 foreach my $use_index ( @column_indexes ) {
# NOTE(review): @header[...] is a one-element array slice; $header[...]
# would be the usual way to fetch a single element.
1921 push( @new_header, @header[$use_index-1] );
# Take a copy of the comment lines; the same copy is handed to both new
# data objects below.
1924 if( defined $self -> {'comment'} ){
1925 my @comment = @
{$self -> {'comment'}};
1926 $comment = \
@comment;
1928 $subset = data
-> new
( filename
=> $subset_name,
1929 directory
=> $self -> {'directory'},
1930 ignoresign
=> $self -> {'ignoresign'},
1931 header
=> \
@new_header,
1932 comment
=> $comment,
1933 individuals
=> \
@new_ids,
1935 ignore_missing_files
=> 1 );
1936 if ( $return_remainder ) {
1938 foreach my $use_index ( @remainder_indexes ) {
1939 push( @new_header_2, @header[$use_index-1] );
1941 $remainder = data
-> new
( filename
=> $remainder_name,
1942 directory
=> $self -> {'directory'},
1943 ignoresign
=> $self -> {'ignoresign'},
1944 header
=> \
@new_header_2,
1945 comment
=> $comment,
1946 individuals
=> \
@new_ids_2,
1948 ignore_missing_files
=> 1 );
1951 end subset_vertically
1959 # if ( defined $expression and defined $bins ) {
1960 # die "data -> subset: expression and bins may not both be specified\n";
1962 # if ( not ( defined $expression or defined $bins ) ) {
1963 # die "data -> subset: expression or bins must be specified\n";
1965 $self -> synchronize
;
1966 my @header = @
{$self -> {'header'}};
1967 my @comment = defined $self -> {'comment'} ? @
{$self -> {'comment'}} : ();
1971 my @ids = @
{$self -> {'individuals'}};
1972 if ( defined $stratify_on ) {
1973 my $work_data = $self -> copy
( filename
=> 'work_data.dta',
1975 my %strata = %{$work_data -> factors
( column
=> $stratify_on )};
1976 # $Data::Dumper::Maxdepth = 1;
1977 # print Dumper \%strata;
1979 while ( my ( $factor, $keys ) = each %strata ) {
1980 foreach my $key ( @
{$keys} ) {
1982 while ( defined $rnd_ids{$factor}{$rnd_num} ) {
1985 $rnd_ids{$factor}{$rnd_num} = $ids[$key];
1989 while ( my ( $factor, $rnd_nums ) = each %rnd_ids ) {
1990 my @sort_rnd_nums = sort { $a <=> $b } keys %{$rnd_nums};
1991 for ( my $i = 0; $i <= $#sort_rnd_nums; $i ) {
1992 for ( my $j = 0; $j < $bins; $j++ ) {
1994 push( @subset_ids, [$rnd_ids{$factor}{$sort_rnd_nums[$i]} -> copy
] );
1995 push( @incl_ids, [$rnd_ids{$factor}{$sort_rnd_nums[$i]} -> idnumber
] );
1997 push( @
{$subset_ids[$j]}, $rnd_ids{$factor}{$sort_rnd_nums[$i]} -> copy
);
1998 push( @
{$incl_ids[$j]}, $rnd_ids{$factor}{$sort_rnd_nums[$i]} -> idnumber
);
2001 last if $i > $#sort_rnd_nums;
2006 for ( my $j = 0; $j < $bins; $j++ ) {
2007 my $sdata = data
-> new
( header
=> \
@header,
2008 comment
=> \
@comment,
2009 ignoresign
=> $self -> {'ignoresign'},
2010 individuals
=> $subset_ids[$j],
2011 ignore_missing_files
=> 1,
2013 idcolumn
=> $self -> {'idcolumn'},
2014 filename
=> "subset_$j.dta" );
2016 push( @subsets, $sdata );
2019 for ( my $i = 0; $i <= $#ids; $i++ ) {
2021 while ( defined $rnd_ids{$rnd_num} ) {
2024 $rnd_ids{$rnd_num} = $ids[$i];
2026 my @keys = sort { $a <=> $b } keys %rnd_ids;
2028 for ( my $i = 0; $i <= $#keys; $i ) {
2029 for ( my $j = 0; $j < $bins; $j++ ) {
2031 push( @subset_ids, [$rnd_ids{$keys[$i]} -> copy
] );
2032 push( @incl_ids, [$rnd_ids{$keys[$i]} -> idnumber
] );
2034 push( @
{$subset_ids[$j]}, $rnd_ids{$keys[$i]} -> copy
);
2035 push( @
{$incl_ids[$j]}, $rnd_ids{$keys[$i]} -> idnumber
);
2038 last if $i > $#keys;
2042 for ( my $j = 0; $j < $bins; $j++ ) {
2043 my $sdata = data
-> new
( header
=> \
@header,
2044 comment
=> \
@comment,
2045 ignoresign
=> $self -> {'ignoresign'},
2046 individuals
=> $subset_ids[$j],
2047 ignore_missing_files
=> 1,
2049 idcolumn
=> $self -> {'idcolumn'},
2050 filename
=> "subset_$j.dta" );
2052 push( @subsets, $sdata );
2064 $self -> synchronize
;
2065 my @header = @
{$self -> {'header'}};
2066 my @comment = defined $self -> {'comment'} ? @
{$self -> {'comment'}} : ();
2067 my @subset_inds = ();
2069 foreach my $individual ( @
{$self -> {'individuals'}} ) {
2070 if ( $individual -> evaluate_expression
( column
=> $based_on,
2071 expression
=> $expression ) ) {
2072 push( @subset_inds, $individual -> copy
);
2073 push( @incl_individuals, $individual -> idnumber
);
2074 push( @included_keys, $key );
2078 $subset = data
-> new
( header
=> \
@header,
2079 comment
=> \
@comment,
2080 ignoresign
=> $self -> {'ignoresign'},
2081 individuals
=> \
@subset_inds,
2082 idcolumn
=> $self -> {'idcolumn'},
2083 filename
=> "subset.dta" );
2093 if ( $parm eq 'disk' and $self -> {'target'} eq 'mem' ) {
2094 $self -> {'target'} = 'disk';
2096 } elsif ( $parm eq 'mem' and $self -> {'target'} eq 'disk' ) {
2097 $self -> {'target'} = 'mem';
2098 $self -> synchronize
;
2109 die "ERROR: data -> _write: No filename set in data object.\n"
2110 if( $filename eq '' );
2112 # $Data::Dumper::Maxdepth = 2;
2113 # die Dumper $self -> {'individuals'};
2115 if( not defined $self -> {'individuals'} ){
2117 # If we don't have any individuals and write to a new
2118 # filename, we must first read individuals from the old
2119 # file. A call to synchronize will do that. There is no risk
2120 # of an infinite loop here since synchronize always writes to
2123 unless( $filename eq $self -> full_name
){
2124 $self -> synchronize
;
2128 open(FILE
,">$filename") ||
2129 die "Could not create $filename\n";
2130 my $data_ref = $self -> format_data
;
2131 my @data = @
{$data_ref};
2137 # if ( $PsN::config -> {'_'} -> {'use_database'} and
2138 # $self -> {'use_data_table'} ) {
2139 # # Backslashes messes up the sql syntax
2140 # my $file_str = $self->{'filename'};
2141 # my $dir_str = $self->{'directory'};
2142 # $file_str =~ s/\\/\//g;
2143 # $dir_str =~ s/\\/\//g;
2146 # my $md5sum = md5_hex(OSspecific::slurp_file($self-> full_name ));
2147 # my ( $date_str, $time_str );
2148 # if ( $Config{osname} eq 'MSWin32' ) {
2149 # $date_str = `date /T`;
2150 # $time_str = ' '.`time /T`;
2153 # $date_str = `date`;
2157 # my $date_time = $date_str.$time_str;
2158 # my $dbh = DBI -> connect("DBI:mysql:host=".$PsN::config -> {'_'} -> {'database_server'}.
2159 # ";databse=".$PsN::config -> {'_'} -> {'project'},
2160 # $PsN::config -> {'_'} -> {'user'},
2161 # $PsN::config -> {'_'} -> {'password'},
2163 # 'RaiseError' => 1});
2165 # if ( defined $self -> {'data_id'} ) {
2166 # $sth = $dbh -> prepare( "UPDATE ".$PsN::config -> {'_'} -> {'project'}.
2168 # "SET filename='$file_str',date='$date_time',".
2169 # "directory='$dir_str',md5sum='$md5sum' ".
2170 # "WHERE data_id='".$self -> {'data_id'}."'" );
2171 # $sth -> execute or debug -> die( message => $sth->errstr ) ;
2173 # $sth = $dbh -> prepare("INSERT INTO ".$PsN::config -> {'_'} -> {'project'}.
2174 # ".data (filename,date,directory,md5sum) ".
2175 # "VALUES ('$file_str', '$date_time', '$dir_str','".
2178 # $self -> {'data_id'} = $sth->{'mysql_insertid'};
2181 # $dbh -> disconnect;
2191 # synchronizes the object with the file on disk and empties
2192 # most of the object's attributes to save memory.
2193 if( defined $self -> {'individuals'} and
2194 ( !$self -> {'synced'} or $force ) ) {
2197 # $self -> {'header'} = undef;
2198 $self -> {'comment'} = undef;
2199 $self -> {'individuals'} = undef;
2200 $self -> {'synced'} = 0;
2201 $self -> {'column_head_indices'} = undef;
2202 $self -> {'have_missing_data'} = undef;
2211 # synchronizes the object with the file on disk
2212 unless( $self -> {'synced'} ){
2213 if( defined $self -> {'individuals'} and
2214 scalar @
{$self -> {'individuals'}} > 0 ){
2215 # We should not read new data from file if we
2216 # have individuals defined?
2217 # Perhaps there should be an attribute
2218 # 'from_file' that overrides this and reads in
2219 # the data from the file specified in filename
2220 # and overwrites whatever the object already
2222 # if( -e $self -> {'filename'} ){
2223 # $self -> _read_header;
2224 # $self -> _read_individuals;
2228 if( -e
$self -> full_name
){
2229 unless( defined $self -> {'header'} and scalar @
{$self -> {'header'}} > 0 ){
2230 $self -> _read_header
;
2232 $self -> _read_individuals
;
2234 debug
-> die( message
=> "Fatal error: datafile: " . $self -> full_name
. " does not exist." );
2240 foreach my $head ( @
{$self -> {'header'}} ){
2241 $self -> {'column_head_indices'} -> {$head} = $i;
2244 $self -> {'synced'} = 1;
2250 # {{{ _fisher_yates_shuffle
start _fisher_yates_shuffle
    {
        # Shuffles, in place, the array referenced by the 'array'
        # argument using the Fisher-Yates algorithm. A warning is logged
        # when an empty array is received; the array is then left as is.

        my $arr_ref = $parm{'array'};
        debug -> warn( level   => 1,
                       message => "Array of zero length received" )
            if ( scalar @{$arr_ref} < 1 );

        # Walk from the last index down to 1 and swap each element with
        # a randomly chosen element at or below it. The explicit
        # "$i > 0" bound makes empty and one-element arrays a no-op; the
        # former "--$i" condition started at -1 for an empty array,
        # which is true, so the loop ran with negative indices.
        for ( my $i = $#{$arr_ref}; $i > 0; $i-- ) {
            # Replaces the earlier "int rand ($i+1)"; presumably draws
            # one integer between 0 and $i inclusive - confirm against
            # the random_uniform_integer documentation.
            my $j = random_uniform_integer(1,0,$i);
            @$arr_ref[$i,$j] = @$arr_ref[$j,$i];
        }
    }
end _fisher_yates_shuffle
2268 # }}} _fisher_yates_shuffle
2274 my $filename = $self -> full_name
;
2275 my $ignoresign = $self -> ignoresign
;
2276 my ( @data, @new_record, $row, $tmp_row, @header, $hdrstring );
2278 open(DATAFILE
,"$filename") ||
2279 die "Could not open $filename for reading";
2281 while (<DATAFILE
>) {
2284 # @new_record = split(/\,|\s+/,$_);
2285 if ( ! (/^\s*\d+|^\s*\./) ) {
2286 $data[$row] = $tmp_row;
2289 # We have reached the first data-row, return.
2290 $columns = scalar split(/\,\s*|\s+/);
2296 if ( defined $self -> {'cont_column'} and not $self -> {'table_file'} ) {
2297 my $data_len = $#data;
2298 for ( my $i = $data_len; $i >= 0; $i-- ) {
2299 my @arr = split(/\,\s*|\s+/,$data[$i]);
2300 if ( $arr[$self -> {'cont_column'}-1] eq 'CONT' ) {
2301 my $start = $i == $data_len ?
0 : 1;
2302 for ( my $j = $start; $j <= $#arr; $j++ ) {
2303 if ( $j != ($self -> {'cont_column'}-1) ) {
2304 push( @header, $arr[$j] );
2310 # the \Q and \E here are to escape weird ignoresigns
2311 $header[0] =~ s/\Q$ignoresign\E//
2312 if ( defined $self->ignoresign );
2313 shift( @header ) if ( $header[0] eq "" );
2315 chomp( $hdrstring = pop(@data));
2316 @header = split(/\,\s*|\s+/,$hdrstring);
2317 # the \Q and \E here are to escape weird ignoresigns
2318 $header[0] =~ s/\Q$ignoresign\E//
2319 if ( defined $self->ignoresign );
2320 shift( @header ) if ( $header[0] eq "" );
2321 if( $self -> {'table_file'} ) {
2323 for( my $i = 1; $i <= scalar @header; $i++ ) {
2324 if( $header[$i-1] eq 'CONT' ) {
2325 if ( defined $self -> {'cont_column'} and not $i == $self -> {'cont_column'} ) {
2326 debug
-> warn( level
=> 1,
2327 message
=> "The supplied columns for the CONT data item (".
2328 $self -> {'cont_column'}.") does not match the column where the CONT ".
2329 "header was found ($i), using $i" );
2331 $self -> {'cont_column'} = $i;
2333 push( @new_header, $header[$i-1] );
2336 @header = @new_header;
2337 for( my $i = 1; $i <= scalar @header; $i++ ) {
2338 if( $header[$i-1] eq 'ID' ) {
2339 if ( defined $self -> {'idcolumn'} and not $i == $self -> {'idcolumn'} ) {
2340 debug
-> warn( level
=> 1,
2341 message
=> "The supplied columns for the ID data item (".
2342 $self -> {'idcolumn'}.") does not match the column where the CONT ".
2343 "header was found ($i), using $i" );
2345 $self -> {'idcolumn'} = $i;
2351 # I'm not certain on how to deal with this conflict. I'm leaving it commented because I believe this code should not be here.
2353 #<<<<<<< data_subs.pm
2354 # $header[0] =~ s/$ignoresign//
2355 # if ( defined $self->ignoresign );
2356 # shift( @header ) if ( $header[0] eq "" );
2360 # It is ok with data sets without a header.
2361 # unless( scalar @header > 0 ){ debug -> die( message => 'Datafile ' . $self -> full_name . ' is empty.' ); }
2363 $self -> {'header'} = \
@header;
2364 $self -> {'comment'} = \
@data;
2365 # if ( $PsN::config -> {'_'} -> {'use_database'} and
2366 # $self -> {'use_data_table'} ) {
2367 # my $dbh = DBI -> connect("DBI:mysql:host=".$PsN::config -> {'_'} -> {'database_server'}.
2368 # ";databse=".$PsN::config -> {'_'} -> {'project'},
2369 # $PsN::config -> {'_'} -> {'user'},
2370 # $PsN::config -> {'_'} -> {'password'},
2371 # {'RaiseError' => 1});
2372 # if ( scalar @header < 1 ) {
2373 # for ( my $i = 1; $i <= $columns; $i++ ) {
2374 # push( @header, $i );
2377 # for ( my $i = 0; $i <= $#header; $i++ ) {
2378 # my $sth = $dbh -> prepare("INSERT INTO ".$PsN::config -> {'_'} -> {'project'}.
2380 # "(name,number,data_id) ".
2381 # "VALUES ('".$header[$i]."', '".($i+1).
2382 # "', '".$self -> {'data_id'}."' )");
2384 # push( @{$self -> {'data_column_ids'}}, $sth->{'mysql_insertid'} );
2387 # $dbh -> disconnect;
2394 # {{{ _read_individuals
# _read_individuals: reads the data file named by $self -> full_name line by
# line, groups consecutive data rows that share the same value in the ID
# column, and pushes one data::individual object per group onto
# $self -> {'individuals'}. While reading it also records, in
# $self -> {'have_missing_data'}, the (1-based) columns in which a value
# numerically equal to $self -> {'missing_data_token'} was seen.
# Uses only $self; no explicit return value is visible here.
# NOTE(review): several short lines of this sub (brace-only lines and some
# short statements) are absent from this copy, so the branch boundaries
# described in the comments below are partly inferred -- confirm against the
# complete file.
2396 start _read_individuals
# 1-based number of the column that holds the subject identifier.
2398 my $idcol = $self -> idcolumn
;
2399 my $filename = $self -> full_name
;
2400 #debug -> warn( level => 1,
2401 # message => "Building array of individuals from file " . $self -> {'filename'} );
# Open the data file for reading; abort if that fails.
# NOTE(review): two-argument open on a bareword handle, and the die message
# does not include $! -- worth modernising when this sub is next touched.
2402 open(DATAFILE
,"$filename") ||
2403 die "Could not open $filename for reading";
2404 my ( @new_row, $new_ID, $old_ID, @init_data );
# First pass: count the newline characters in the file, 4096 bytes at a
# time; the count is used as the number of status bar steps below.
# NOTE(review): $buffer and $lines are not declared in the lines visible
# here -- presumably declared just above this loop; confirm.
2407 while (sysread DATAFILE
, $buffer, 4096) {
2408 $lines += ($buffer =~ tr/\n//);
# Rewind to the start of the file for the second, line-by-line pass.
2410 seek( DATAFILE
, 0,0 );
# Progress reporting: the status bar gets one step per newline counted above.
2413 my $status_bar = status_bar
-> new
( steps
=> $lines );
2415 ui
-> print( category
=> 'scm',
2416 message
=> "Reading data file: ".$self -> filename
);
2417 ui
-> print( category
=> 'scm',
2418 message
=> $status_bar -> print_step
(),
# NOTE(review): the remainder of the ui -> print call above is not present
# in this copy.
# In the lines visible here these four variables are referenced only by the
# commented-out database code.
2421 my ( $sth, $dbh, $first_row_id, $first_value_id );
# Disabled (commented-out) database bookkeeping follows.
2423 # if ( $PsN::config -> {'_'} -> {'use_database'} and
2424 # $self -> {'use_data_table'} ) {
2425 # $dbh = DBI -> connect("DBI:mysql:host=".$PsN::config -> {'_'} -> {'database_server'}.
2426 # ";databse=".$PsN::config -> {'_'} -> {'project'},
2427 # $PsN::config -> {'_'} -> {'user'},
2428 # $PsN::config -> {'_'} -> {'password'},
2429 # {'RaiseError' => 1});
2430 # my $sth = $dbh -> prepare( "SELECT data_row_id FROM ".$PsN::config -> {'_'} -> {'project'}.
2432 # "WHERE data_id='".$self -> {'data_id'}."'" );
2433 # $sth -> execute or debug -> die( message => $sth->errstr ) ;
2434 # my $select_arr = $sth -> fetchall_arrayref;
2435 # if ( scalar @{$select_arr} > 0 ) {
2436 # for ( my $i = 0; $i < scalar @{$select_arr}; $i++ ) {
2437 # push( @{$self -> {'data_row_ids'}}, $select_arr->[$i][0] );
2439 # $sth = $dbh -> prepare( "SELECT data_value_id FROM ".$PsN::config -> {'_'} -> {'project'}.
2441 # "WHERE data_id='".$self -> {'data_id'}."'" );
2442 # $sth -> execute or debug -> die( message => $sth->errstr ) ;
2443 # my $select_val = $sth -> fetchall_arrayref;
2444 # for ( my $i = 0; $i < scalar @{$select_val}; $i++ ) {
2445 # push( @{$self -> {'data_value_ids'}}, $select_val->[$i][0] );
2448 # $dbh -> disconnect;
2450 # $dbh -> do( "LOCK TABLES ".$PsN::config -> {'_'} -> {'project'}.
2451 # ".data_row WRITE, ".$PsN::config -> {'_'} -> {'project'}.
2452 # ".data_value WRITE" );
2453 # $sth = $dbh -> prepare( "SELECT MAX(data_row_id) FROM ".$PsN::config -> {'_'} -> {'project'}.
2455 # $sth -> execute or debug -> die( message => $sth->errstr ) ;
2456 # my $select_arr = $sth -> fetchall_arrayref;
2457 # $first_row_id = defined $select_arr -> [0][0] ? $select_arr -> [0][0] : 0;
2458 # $sth = $dbh -> prepare( "SELECT MAX(data_value_id) FROM ".$PsN::config -> {'_'} -> {'project'}.
2460 # $sth -> execute or debug -> die( message => $sth->errstr ) ;
2461 # my $select_arr = $sth -> fetchall_arrayref;
2462 # $first_value_id = defined $select_arr -> [0][0] ? $select_arr -> [0][0] : 0;
# Row counter; in the lines visible here it is only read by the
# commented-out database code.
2469 my $row_counter = 0;
# Second pass: read the file one line at a time (the current line is in $_).
2471 ROW
: while ( <DATAFILE
> ) {
# Split the line into fields, on a comma followed by optional whitespace or
# on a run of whitespace.
# NOTE(review): this "my" shadows the @new_row declared before the loop.
2474 my @new_row = split(/\,\s*|\s+/);
# Only lines that begin (after optional whitespace) with a digit or a '.'
# are processed as data rows by the code below.
2475 # This regexp check is not time consuming.
2476 if ( /^\s*\d+|^\s*\./ ) {
# When a CONT column is defined, its field is left out of the stored row.
# NOTE(review): $full_row is not declared in the lines visible here.
2477 if ( defined $self -> {'cont_column'} ) {
2478 if ( $new_row[$self -> {'cont_column'} - 1] == 1 ) {
2479 if ( not $self -> {'table_file'} ) { # Skip the CONT=1 rows if this is a table file
# Prepend every field except the CONT column -- and except field 0, because
# this loop stops at $i > 0 -- to @{$full_row}, keeping the field order.
2480 for ( my $i = $#new_row; $i > 0; $i-- ) {
2481 if ( $i != ($self -> {'cont_column'} - 1) ) {
2482 unshift( @
{$full_row}, $new_row[$i] );
# Presumably the CONT != 1 branch (the else line is missing from this copy):
# prepend every field except the CONT column, this time including field 0.
2488 for ( my $i = $#new_row; $i >= 0; $i-- ) {
2489 # if ( $i != ($self -> {'cont_column'} - 1) or $self -> {'table_file'} ) {
2490 if ( $i != ($self -> {'cont_column'} - 1) ) {
2491 unshift( @
{$full_row}, $new_row[$i] );
# Presumably the branch taken when no CONT column is defined: the stored row
# is a copy of all fields.
2496 @
{$full_row} = @new_row;
# The first data row seeds $old_ID with its own ID.
2498 $new_ID = $full_row -> [$idcol-1]; # index starts at 0
2499 $old_ID = $new_ID if ( not defined $old_ID );
2501 # Check whether any column has missing data in this row (this adds about 30% of init time)
2502 my $mdt = $self -> {'missing_data_token'};
# have_missing_data is keyed by 1-based column number; the comparison with
# the missing data token is numeric.
2503 for( my $i = 0; $i <= $#{$full_row}; $i++ ){
2504 $self -> {'have_missing_data'} -> {$i+1} = 1
2505 if( $full_row -> [$i] == $mdt ); # == is slower but safer than eq
2507 # if ( $PsN::config -> {'_'} -> {'use_database'} and
2508 # $self -> {'use_data_table'} and $insert ) {
2510 # $insert_rows = $insert_rows."," if ( defined $insert_rows );
2511 # $insert_rows = $insert_rows.
2512 # "('$row_counter', '".$self -> {'data_id'}."' )";
2513 # for ( my $j = 0; $j <= $#{$full_row}; $j++ ) {
2514 # $insert_values = $insert_values."," if ( defined $insert_values );
2515 # $insert_values = $insert_values.
2516 # "('".$full_row -> [$j]."', '".
2517 # ($first_row_id+$row_counter)."', '".
2518 # $self -> {'data_column_ids'}->[$j].
2519 # "', '".$self -> {'data_id'}."' )";
# A change of ID (numeric comparison) closes the previous individual: build
# a data::individual from a copy of the rows gathered so far, store it in
# $self -> {'individuals'}, and start a new row list holding the current row
# re-joined with commas.
2523 if ( $new_ID != $old_ID ) {
2524 my @subject_data = @init_data;
2525 my $id = data
::individual
-> new
( idcolumn
=> $idcol,
2526 subject_data
=> \
@subject_data,
2527 data_id
=> $self -> {'data_id'} );
2528 push( @
{$self -> {'individuals'}}, $id );
2529 @init_data =(join( ",", @
{$full_row}));
# Presumably the else branch (same ID as the previous row): append the
# current row, re-joined with commas, to the rows being gathered.
2531 push( @init_data, join( ",", @
{$full_row}) );
# Advance the status bar and print it when tick() returns a true value.
2536 if ( $status_bar -> tick
() ) {
2537 ui
-> print( category
=> 'scm',
2538 message
=> $status_bar -> print_step
(),
2544 # if ( $PsN::config -> {'_'} -> {'use_database'} and
2545 # $self -> {'use_data_table'} and $insert ) {
2546 # $dbh -> do("INSERT INTO ".$PsN::config -> {'_'} -> {'project'}.
2548 # "(number,data_id) ".
2549 # "VALUES ".$insert_rows);
2550 # push( @{$self -> {'data_row_ids'}}, ($first_row_id..$first_row_id+$row_counter) );
2551 # $dbh -> do( "INSERT INTO ".$PsN::config -> {'_'} -> {'project'}.
2553 # "(value,data_row_id,data_column_id,data_id) ".
2554 # "VALUES ".$insert_values );
2555 # push( @{$self -> {'data_value_ids'}},
2556 # ($first_value_id..$first_value_id+($row_counter*
2557 # scalar @{$self->{'data_column_ids'}})));
2558 # $dbh -> do( "UNLOCK TABLES" );
2559 # $dbh -> disconnect;
# Store whatever rows are still gathered in @init_data as one more
# individual.
# NOTE(review): unlike the data::individual -> new call further up, no
# data_id is passed here -- confirm that this is intended.
2562 if ( $#init_data >= 0 ) {
2563 push( @
{$self -> {'individuals'}},
2564 data
::individual
-> new
( idcolumn
=> $idcol,
2565 subject_data
=> \
@init_data ) );
# Tell the user that reading has finished.
2567 ui
-> print( category
=> 'scm',
2568 message
=> " ... done" );
2570 # $self -> _write( filename => 'test.dta' );
2572 end _read_individuals
2574 # }}} _read_individuals