*** empty log message ***
[PsN.git] / lib / data_subs.pm
blob1482511bee98217f812f8611518d386c586d511a
1 This file contains the code for the subroutines in
2 Perl-speaks-NONMEMs data module. It is not functional by itself.
3 The code should be transferred to the module autogenerated by dia2code
4 using the fill_diacode.pl script.
6 TODO: Fix the synced attribute so that all methods that change the state of the
7 data object sets synced=0
9 # {{{ include
11 start include statements
12 use Digest::MD5 'md5_hex';
13 use OSspecific;
14 use File::Copy "cp";
15 use Carp;
16 use Carp qw(cluck);
17 use Config;
18 use Math::Random;
19 use debug;
20 use ui;
21 use Data::Dumper;
22 use Time::HiRes qw(gettimeofday);
23 end include
25 # }}} include statements
27 start description
28 # The structure of the data class is subject-centric, recognising that
29 # the subjects included in a study often can be regarded as
30 # independent. A class for the subject level exists within PsN and is
31 # called the individual class. A data object consists of at least one
32 # but probably many individual objects plus optional comments.
33 end description
35 start synopsis
36 # use data;
38 # my $data_obj = data -> new ( filename => 'test040314.dta' );
40 # $data_obj -> renumber_ascending;
42 # my $subsets_ref = $data_obj -> case_deletion( bins => 10 );
44 # my @subsets = @{$subsets_ref};
45 end synopsis
47 start see_also
48 # =begin html
50 # <a HREF="model.html">model</a>, <a HREF="output.html">output</a>,
51 # <a HREF="tool/modelfit.html">tool::modelfit</a>,
52 # <a HREF="tool.html">tool</a>
54 # =end html
56 # =begin man
58 # model, output, tool::modelfit, tool
60 # =end man
61 end see_also
64 # {{{ new
66 start new
68 # If the column holding the subject identifier is not the
69 # first, it can be specified using the I<idcolumn> attribute
71 # I<ignoresign> determines which rows that are regarded as
72 # comments. Corresponds to the IGNORE= option in the $DATA
73 # record in a NONMEM model file.
75 ( $this -> {'directory'},
76 $this -> {'filename'} ) = OSspecific::absolute_path( $this -> {'directory'},
77 $this->{'filename'} );
79 debug -> warn( level => 2,
80 message => "data -> new: Data object initialized from file: ".
81 $this -> full_name );
83 sub register_in_database {
84 # Backslashes messes up the sql syntax
85 my $file_str = $this->{'filename'};
86 my $dir_str = $this->{'directory'};
87 $file_str =~ s/\\/\//g;
88 $dir_str =~ s/\\/\//g;
90 # md5sum
91 my $md5sum = md5_hex(OSspecific::slurp_file($this-> full_name ));
92 my $dbh = DBI -> connect("DBI:mysql:host=localhost;databse=psn",
93 "psn", "psn_test",
94 {'RaiseError' => 1});
95 my $sth;
96 my $sth = $dbh -> prepare( "SELECT data_id FROM psn.data ".
97 "WHERE filename = '$file_str' AND ".
98 "directory = '$dir_str' AND ".
99 "md5sum = '".$md5sum."'" );
100 $sth -> execute or debug -> die( message => $sth->errstr ) ;
101 my $select_arr = $sth -> fetchall_arrayref;
102 if ( scalar @{$select_arr} > 0 ) {
103 debug -> warn( level => 1,
104 message => "Found an old entry in the database matching the ".
105 "current data file" );
106 if ( scalar @{$select_arr} > 1 ) {
107 debug -> warn( level => 1,
108 message => "Found more than one matching entry in database".
109 ", using the first" );
111 $this -> {'data_id'} = $select_arr->[0][0];
112 } else {
113 my ( $date_cmd, $time_cmd );
114 if( $Config{osname} eq 'MSWin32' ){
115 $date_cmd = 'date /T';
116 $time_cmd = 'time /T';
117 } else {
118 # Assuming UNIX
119 $date_cmd = 'date';
120 $time_cmd = 'time';
123 my $date_str = `$date_cmd`;
124 my $time_str = `$time_cmd`;
125 chomp($date_str);
126 chomp($time_str);
127 my $date_time = "$date_str $time_str";
129 $sth = $dbh -> prepare("INSERT INTO psn.data (filename,date,directory,md5sum) ".
130 "VALUES ('$file_str', '$date_time', '$dir_str','".
131 $md5sum."' )");
132 $sth -> execute;
133 $this -> {'data_id'} = $sth->{'mysql_insertid'};
135 $sth -> finish;
136 $dbh -> disconnect;
140 unless ( ( defined $this -> {'header'} and
141 scalar @{$this -> {'header'}} > 0 ) or
142 ( defined $this -> {'individuals'} and
143 scalar @{$this -> {'individuals'}} > 0 ) ) {
144 if ( -e $this -> full_name ) {
145 if ( $this -> {'target'} eq 'mem' ) {
146 &register_in_database if ( $PsN::config -> {'_'} -> {'use_database'} );
147 $this -> _read_header;
148 $this -> _read_individuals;
149 $this -> {'synced'} = 1;
150 } else {
151 $this -> {'synced'} = 0;
153 } else {
154 debug -> die(message => "Error in data -> new: No header, individuals, and no file " . $this -> full_name . " on disk.")
155 unless $this -> {'ignore_missing_files'};
156 $this -> {'synced'} = 0;
158 } else {
159 # if ( defined $this -> {'header'} and
160 # scalar @{$this -> {'header'}} > 0 ) {
161 # if ( -e $this -> full_name ) {
162 # my @given_header = @{$this -> {'header'}};
163 # # $this -> _read_header;
164 # print "HEPP\n";
165 # foreach my $col ( @{$this -> {'header'}} ) {
166 # my $given_col = shift(@given_header);
167 # print "DATA $col, MODEL $given_col\n" if $this -> {'debug'};
168 # # unless( $col eq $given_col ){
169 # # die "data::new: Given header does not match header in file on disk\n";
170 # # }
175 if ( $this -> {'target'} eq 'mem') {
176 if ( -e $this -> {'filename'} ) {
177 $this -> _read_header;
178 &register_in_database if ( $PsN::config -> {'_'} -> {'use_database'} );
179 $this -> _read_individuals;
180 $this -> {'synced'} = 1;
181 } else {
182 confess "Error in data -> new: No file:".$this->{'filename'}." on disk\n"
183 unless $this -> {'ignore_missing_files'};
184 $this -> {'synced'} = 0;
186 } else {
187 $this -> flush;
191 if ( $this -> {'synced'} ) {
192 my $i = 1;
193 foreach my $head ( @{$this -> {'header'}} ) {
194 $this -> {'column_head_indices'} -> {$head} = $i;
195 $i++;
199 end new
201 # }}} new
203 # {{{ full_name
204 start full_name
206 $full_name = $self -> {'directory'} . $self -> {'filename'};
208 end full_name
209 # }}}
211 # {{{ bootstrap
213 start bootstrap
215 # The bootstrap method draws I<samples> number of boostrap
216 # samples from the data set. The I<subjects> arguments
217 # determines the size of each sample (default equals to the
218 # number of individuals in the original data set). The method
219 # returns references to three arrays: I<boot_samples_ref>,
220 # which holds the bootstrap data sets, I<incl_individuals_ref>
221 # which holds arrays containing the subject identifiers (ID's)
222 # for the included individuals of each bootstrap data set and
223 # I<included_keys_ref> which holds the key or index of the
224 # included individuals. The key or index is an integer
225 # starting at 1 for the first individual in the original data
226 # set and increasing by one for each following.
227 $self -> synchronize;
228 my @header = @{$self -> {'header'}};
229 my $individuals = $self -> {'individuals'};
230 my $key_ref;
231 for ( my $i = 1; $i <= $samples; $i++ ) {
232 my $new_name = defined $name_stub ? $name_stub."_$i.dta" : "bs$i.dta";
233 $new_name = $directory.'/'.$new_name;
234 my ( $boot, $incl_ind_ref, $incl_key_ref ) =
235 $self -> resample( subjects => $subjects,
236 resume => $resume,
237 new_name => $new_name,
238 target => $target,
239 stratify_on => $stratify_on );
240 push( @included_keys, $incl_key_ref );
241 push( @incl_individuals, $incl_ind_ref );
242 # $boot -> renumber_ascending;
243 push( @boot_samples, $boot );
244 # $boot -> synchronize;
245 # $boot -> flush;
248 end bootstrap
250 # }}} bootstrap
252 # {{{ resample
254 start resample
256 $self -> synchronize;
257 my ( @header, $individuals, @bs_inds, $key_ref );
258 if ( defined $stratify_on ) {
259 unless ( $resume and -e $new_name ) {
260 @header = @{$self -> {'header'}};
261 $individuals = $self -> {'individuals'};
262 my %strata;
263 if( $stratify_on =~ /\D/ ){
264 %strata = %{$self -> factors( column_head => $stratify_on )};
265 if ( $strata{'Non-unique values found'} eq '1' ) {
266 debug -> die( message => "Individuals were found to have multiple values in the $stratify_on column. ".
267 "The column $stratify_on cannot be used for stratification of the resampling." );
269 } else {
270 %strata = %{$self -> factors( column => $stratify_on )};
271 if ( $strata{'Non-unique values found'} eq '1' ) {
272 debug -> die( message => "Individuals were found to have multiple values in column number $stratify_on. ".
273 "Column $stratify_on cannot be used for stratification of the resampling." );
277 while( my ( $factor, $key_list ) = each %strata ) {
278 my $keys = scalar @{$key_list};
279 for ( my $i = 0; $i < $keys; $i++ ) {
280 my $list_ref = random_uniform_integer(1,0,$keys-1);
281 push( @bs_inds, $individuals ->
282 [ $key_list -> [$list_ref] ] -> copy );
283 push( @included_keys, $key_list -> [$list_ref] );
284 push( @incl_individuals, $individuals ->
285 [ $key_list -> [$list_ref] ] -> idnumber );
289 $boot = data -> new( header => \@header,
290 idcolumn => $self -> {'idcolumn'},
291 ignoresign => $self -> {'ignoresign'},
292 individuals => \@bs_inds,
293 filename => $new_name,
294 ignore_missing_files => 1,
295 target => 'mem' );
296 $boot -> renumber_ascending;
297 $boot -> _write;
298 $boot -> target( $target );
300 } else {
301 unless ( $resume and -e $new_name ) {
302 @header = @{$self -> {'header'}};
303 $individuals = $self -> {'individuals'};
304 for ( my $i = 1; $i <= $subjects; $i++ ) {
305 $key_ref = random_uniform_integer(1,0,scalar @{$individuals}-1);
306 push( @bs_inds, $individuals -> [ $key_ref ] -> copy );
307 push( @included_keys, $key_ref );
308 push( @incl_individuals, $individuals -> [ $key_ref ] -> idnumber );
311 # MUST FIX: If a file already exists with the same name,
312 # the created bs data set will be appended to this. IT
313 # MUST BE OVERWRITTEN!
314 $boot = data -> new( header => \@header,
315 idcolumn => $self -> {'idcolumn'},
316 ignoresign => $self -> {'ignoresign'},
317 individuals => \@bs_inds,
318 filename => $new_name,
319 ignore_missing_files => 1,
320 target => 'mem' );
321 $boot -> renumber_ascending;
322 $boot -> _write;
323 $boot -> target( $target );
324 } else {
325 # If we are resuming, we still need to generate the
326 # pseudo-random sequence and initiate a data object
327 for ( my $i = 1; $i <= $subjects; $i++ ) {
328 random_uniform_integer(1,0,scalar @{$individuals}-1)
330 $boot = data -> new( idcolumn => $self -> {'idcolumn'},
331 ignoresign => $self -> {'ignoresign'},
332 filename => $new_name,
333 ignore_missing_files => 1,
334 target => $target );
336 if( $target eq 'disk'){
337 $boot -> flush;
341 end resample
343 # }}} resample
345 # {{{ case_deletion
347 start case_deletion
349 # case_deletion creates subsets of the data. The number of
350 # subsets is specified by the bins argument. The individuals
351 # of each subset is selected randomly or in ascending
352 # numerical order depending on the selection argument that can
353 # be either 'consecutive' or 'random'. case_column must be
354 # specified to give the method something to base the selection
355 # on. Valid case_column values are either the column number
356 # (pure digits) or the name of the column in the (optional)
357 # header row.
358 $self -> synchronize;
359 my @header = @{$self -> {'header'}};
360 if ( not defined $case_column ) {
361 debug -> die( message => "case_column must be specified" );
362 } else {
363 if ( not $case_column =~ /^\d/ ) {
364 for ( my $i = 0; $i <= $#header; $i++ ) {
365 $case_column = $i+1 if ( $header[$i] eq $case_column );
369 $bins = defined $bins ? $bins :
370 scalar keys %{$self -> factors( column => $case_column)};
371 my %factors = %{$self -> factors( column => $case_column )};
372 if ( $factors{'Non-unique values found'} eq '1' ) {
373 debug -> die( message => "Individuals were found to have multiple values in column number $case_column. ".
374 "Column $case_column cannot be used for case deletion." );
377 my $maxbins = scalar keys %factors;
378 my @ftrs = sort { $a <=> $b } keys %factors;
379 my $individuals = $self -> {'individuals'};
380 my $maxkey = scalar @{$individuals} - 1;
382 my ( @tmp_ftrs, @binsize ) =
383 ((),());
384 my ( $k, $j, $i ) = ( 0, 0, 0 );
385 # Create the binsizes
386 for ( $j = 0; $j < $maxbins; $j++ ) {
387 $binsize[ $k++ ]++;
388 $k = 0 if( $k >= $bins );
390 $self -> _fisher_yates_shuffle( array => \@ftrs ) if( $selection eq 'random' );
391 for ( $k = 0; $k < $bins; $k++ ) {
392 for ( $j = 0; $j < $binsize[ $k ]; $j++ ) {
393 # print "SK: ",$skipped_keys[ $k ]," F: ",$factors{ $ftrs[ $i ] },"\n";
394 push( @{$skipped_keys[ $k ]}, @{$factors{ $ftrs[ $i ] }} );
395 push( @{$skipped_values[ $k ]}, $ftrs[ $i++ ] );
399 for ( $k = 0; $k < $bins; $k++ ) {
400 my @cd_inds = ();
401 SELKEYS: foreach my $key ( 0..$maxkey ) {
402 foreach my $skipped ( @{$skipped_keys[ $k ]} ) {
403 if ( $key == $skipped ) {
404 push( @{$skipped_ids[ $k ]}, $individuals ->
405 [ $skipped ] -> idnumber );
406 next SELKEYS;
409 push( @cd_inds, $individuals -> [ $key ] -> copy );
411 # Set ignore_missing_files = 1 to make it possible to get the result
412 # in memory only
413 my $newdata = data ->
414 new ( header => \@header,
415 ignoresign => $self -> {'ignoresign'},
416 idcolumn => $self -> {'idcolumn'},
417 individuals => \@cd_inds,
418 target => $target,
419 filename => 'cd'.$k+1 .'.dta',
420 ignore_missing_files => 1 );
421 push( @subsets, $newdata );
424 end case_deletion
426 # }}} case_deletion
428 # {{{ copy
429 start copy
431 # filename: new data file name.
433 # target: keep the copy in memory ('mem') or write it to disk and flush the memory ('disk').
435 ($directory, $filename) = OSspecific::absolute_path( $directory, $filename );
437 # Clone self into new data object and set synced to 0 for
438 # the copy
439 cp($self -> full_name, $directory.$filename );
440 $new_data = Storable::dclone( $self );
441 # $new_data -> {'synced'} = 0;
443 # Set the new file name for the copy
444 $new_data -> directory( $directory );
445 $new_data -> filename( $filename );
447 # $new_data -> _write;
449 # my $header;
450 # my $comment;
451 # my $ind_copies;
453 # unless( defined $self -> {'header'} ) {
454 # $self -> synchronize;
457 # my @header = @{$self -> {'header'}};
458 # my @comment = @{$self -> {'comment'}};
459 # $header = \@header;
460 # $comment = \@comment;
461 # my @ind_copies;
462 # foreach my $individual ( @{$self -> {'individuals'}} ) {
463 # push( @ind_copies, $individual -> copy );
465 # $ind_copies = \@ind_copies;
467 # $data_copy = data -> new ( header => $header,
468 # comment => $comment,
469 # ignoresign => $self -> {'ignoresign'},
470 # individuals => $ind_copies,
471 # idcolumn => $self -> {'idcolumn'},
472 # debug => $self -> {'debug'},
473 # filename => $filename,
474 # target => $target,
475 # ignore_missing_files => $ignore_missing_files);
477 # if( $self -> {'target'} eq 'disk' ){
478 # $self -> flush;
481 end copy
483 # }}} copy
485 # {{{ count_ind
487 start count_ind
489 # Returns the number of individuals in the data set.
490 $self -> synchronize;
491 $num = scalar @{$self -> {'individuals'}};
493 end count_ind
495 # }}} count_ind
497 # {{{ filename
498 start filename
500 if ( defined $parm and $parm ne $self -> {'filename'} ) {
501 $self -> {'filename'} = $parm;
502 $self -> {'data_id'} = undef;
503 $self -> _write;
506 end filename
507 # }}} filename
509 # {{{ fractions
511 start fractions
513 my %factors = $self -> factors( 'return_occurences' => 1,
514 'unique_in_individual' => $unique_in_individual,
515 'column_head' => $column_head,
516 'column' => $column);
518 my $sum = 0;
519 while (my ($factor, $amount) = each %factors) {
520 if ( $factor == $self -> {'missing_data'} && $ignore_missing ) {
521 next;
522 } else {
523 $sum += $amount;
526 while (my ($factor, $amount) = each %factors) {
527 if ( $factor == $self -> {'missing_data'} && $ignore_missing ) {
528 next;
529 } else {
530 $fractions{$factor} = $amount/$sum;
534 end fractions
536 # }}} fractions
538 # {{{ factors
540 start factors
542 # Either column (number, starting at 1) or column_head must be specified.
544 # The default behaviour is to return a hash with the factors as keys
545 # and as values references to arrays with the order numbers (not the ID numbers)
546 # of the individuals that contain this factor
548 # If unique_in_individual is true (1), the returned hash will contain
549 # an element with key 'Non-unique values found' and value 1 if any
550 # individual contain more than one value in the specified column.
552 # Return occurences will calculate the occurence of each
553 # factor value. Several occurences in one individual counts as
554 # one occurence. The elements of the returned hash will have the factors
555 # as keys and the number of occurences as values.
558 $self -> synchronize;
560 # Check if $column(-index) is defined and valid, else try to find index
561 # using column_head
562 my $first_id = $self -> {'individuals'}[0];
564 debug -> die( message => "No individuals defined in data object based on ".
565 $self -> full_name ) unless ( defined $first_id );
567 unless ( defined $column && defined( $first_id -> subject_data ->[0][$column-1] ) ) {
568 unless (defined($column_head) && defined($self -> {'column_head_indices'}{$column_head})) {
569 debug -> die( message => "Error in data -> factors: unknown column: \"$column_head\" ".
570 "or invalid column number: \"$column\".\n".
571 "Valid column numbers are 1 to ".scalar @{$first_id -> subject_data ->[0]}."\n".
572 "Valid column headers are (in no particular order):\n".
573 join(', ',keys(%{$self -> {'column_head_indices'}})) );
574 } else {
575 $column = $self -> {'column_head_indices'}{$column_head};
576 debug -> warn( level => 2,
577 message => "$column_head is in column number $column" );
581 my $key = 0;
582 foreach my $individual ( @{$self -> {'individuals'}} ) {
583 my @ifactors = keys %{$individual -> factors( column => $column )};
584 if ( scalar @ifactors > 1 and $unique_in_individual ) {
585 %factors = ( 'Non-unique values found' => 1 );
586 last;
588 debug -> die( message => "No value found in column $column in individual ".
589 $individual -> idnumber ) if ( scalar @ifactors == 0 );
591 # Return occurences will calculate the occurence of each
592 # factor value. Several occurences in one individual counts as
593 # one occurence.
595 if ( $return_occurences ) {
596 foreach my $ifactor ( @ifactors ) {
597 $factors{$ifactor}++;
599 } else {
600 foreach my $ifactor ( @ifactors ) {
601 push( @{$factors{$ifactor}}, $key );
604 $key++;
607 end factors
609 # }}} factors
611 # {{{ find_individual
613 # start find_individual
614 # foreach my $tmp_ind ( @{$self -> individuals} ) {
615 # if ( $tmp_ind -> key == $key ) {
616 # $individual = $tmp_ind;
617 # last;
620 # if ( defined $individual ) {
621 # if ( $copy ) {
622 # $individual = $individual -> copy;
624 # } else {
625 # print "No individual with key $key found in call to ".
626 # "data -> find_individual\n" if ( $self -> debug );
628 # end find_individual
630 # }}}
632 # {{{ format_data
633 start format_data
635 # format the data for NONMEM (simple comma-separated layout)
636 foreach my $individual ( @{$self -> {'individuals'}} ) {
637 foreach my $row ( @{$individual -> subject_data} ) {
638 push( @form_data, join( ',', @{$row} )."\n" );
642 end format_data
643 # }}} format_data
645 # {{{ have_missing_data
646 start have_missing_data
648 # Either I<column> or I<column_head> must be specified.
650 # This method looks through the data column with index I<column> or
651 # (optional) header name I<column_head> and returns O if no missing
652 # data indicator was found or 1 otherwise.
654 $self -> synchronize;
655 my $first_id = $self -> {'individuals'}[0];
656 debug -> die( message => "No individuals defined in data object based on ".
657 $self -> full_name ) unless ( defined $first_id );
658 unless ( defined $column && defined( $first_id -> subject_data ->[0][$column-1] ) ) {
659 unless(defined($column_head) && defined($self -> {'column_head_indices'}{$column_head})){
660 die "Error in data -> have_missing_data: unknown column: \"$column_head\" or invalid column number: \"$column\"\n";
661 } else {
662 $column = $self -> {'column_head_indices'}{$column_head};
665 $self -> flush if ( $self -> {'target'} eq 'disk' );
666 $return_value = $self -> {'have_missing_data'} -> {$column};
668 end have_missing_data
669 # }}} have_missing_data
671 # {{{ merge
672 start merge
674 #$self -> synchronize;
675 push( @{$self -> {'individuals'}}, @{$mergeobj -> individuals} );
677 end merge
678 # }}} merge
680 # {{{ max
682 start max
684 # Either column or column_head must be specified. Column_head must be a string that
685 # identifies a column in the (optional ) data file header.
687 # The if-statement below used to be a cache of allready calculated
688 # means. But since individuals can be accessed in so many ways, we
689 # don't know when this cache should be updated. Its easier to
690 # recalculate the max. Maybe we can include this optimization in the
691 # future, if it turns out to be a bottleneck
692 # my $tmp_column = $self -> {'column_head_indices'}{$column_head};
693 # if ( defined $self -> {'max'}[$tmp_column] ) {
694 # $return_value = $self -> {'max'}[$tmp_column] ;
695 # } else {
696 $self -> synchronize;
697 my $first_id = $self -> {'individuals'}[0];
698 die "data -> max: No individuals defined in data object based on ",
699 $self -> full_name,"\n" unless defined $first_id;
700 unless ( defined $column && defined( $first_id -> subject_data ->[0][$column-1] ) ) {
701 unless (defined($column_head) && defined($self -> {'column_head_indices'}{$column_head})) {
702 die "Error in data -> max: unknown column: \"$column_head\" or invalid column number: \"$column\"\n";
703 } else {
704 $column = $self -> {'column_head_indices'}{$column_head};
707 foreach my $individual ( @{$self -> {'individuals'}} ) {
708 my $ifactors = $individual -> factors( 'column' => $column );
709 foreach ( keys %{$ifactors} ) {
710 next if ( $_ == $self -> {'missing_data_token'} );
711 if ( defined ($return_value) ) {
712 $return_value = $_ > $return_value ? $_ : $return_value;
713 } else {
714 $return_value = $_;
719 # $self -> {'max'}[$column] = $return_value;
720 $self -> flush if ( $self -> {'target'} eq 'disk' );
723 end max
725 # }}} max
727 # {{{ min
729 start min
731 # See L</max>.
732 my $tmp_column = $self -> {'column_head_indices'}{$column_head};
734 # The if-statement below used to be a cache of allready calculated
735 # means. But since individuals can be accessed in so many ways, we
736 # don't know when this cache should be updated. Its easier to
737 # recalculate the min. Maybe we can include this optimization in the
738 # future, if it turns out to be a bottleneck
739 # if ( defined $self -> {'min'}[$tmp_column] ) {
740 # $return_value = $self -> {'min'}[$tmp_column] ;
741 # } else {
742 $self -> synchronize;
743 my $first_id = $self -> {'individuals'}[0];
744 die "data -> min: No individuals defined in data object based on ",
745 $self -> full_name,"\n" unless defined $first_id;
746 unless ( defined $column && defined( $first_id -> subject_data ->[0][$column-1] ) ) {
747 unless (defined($column_head) && defined($self -> {'column_head_indices'}{$column_head})) {
748 die "Error in data -> min: unknown column: \"$column_head\" or invalid column number: \"$column\"\n";
749 } else {
750 $column = $self -> {'column_head_indices'}{$column_head};
753 foreach my $individual ( @{$self -> {'individuals'}} ) {
754 my $ifactors = $individual -> factors( 'column' => $column );
755 foreach ( keys %{$ifactors} ) {
756 next if ( $_ == $self -> {'missing_data_token'} );
757 if ( defined ($return_value) ) {
758 $return_value = $_ < $return_value ? $_ : $return_value;
759 } else {
760 $return_value = $_;
764 # $self -> {'min'}[$column] = $return_value;
765 $self -> flush if ( $self -> {'target'} eq 'disk' );
768 end min
770 # }}} min
772 # {{{ median
774 start median
776 # See L</max>.
777 $self -> synchronize;
778 my $first_id = $self -> {'individuals'}[0];
779 die "data -> median: No individuals defined in data object based on ",
780 $self -> full_name,"\n" unless defined $first_id;
781 unless ( defined $column && defined( $first_id -> subject_data ->[0][$column-1] ) ) {
782 unless(defined($column_head) && defined($self -> {'column_head_indices'}{$column_head})){
783 die "Error in data -> median: unknown column: \"$column_head\" or invalid column number: \"$column\"\n";
784 } else {
785 $column = $self -> {'column_head_indices'}{$column_head};
789 if( defined $self -> {'median'}[$column] ){
790 return $self -> {'median'}[$column];
793 my @median_array;
795 foreach my $individual ( @{$self -> {'individuals'}} ) {
796 if( $unique_in_individual ){
797 my $ifactors = $individual -> factors( 'column' => $column );
799 foreach ( keys %{$ifactors} ) {
800 next if ( $_ == $self -> {'missing_data_token'} );
801 push( @median_array, $_ );
803 } else {
804 my $ifactors = $individual -> subject_data;
806 for(my $i=0; $i<=$#{$ifactors}; $i++ ) {
807 next if ( $ifactors -> [$i] -> [$column-1] == $self -> {'missing_data_token'} );
808 push(@median_array, $ifactors -> [$i] -> [$column-1]);
812 @median_array = sort {$a <=> $b} @median_array ;
814 if( @median_array % 2 ){
815 $return_value = $median_array[$#median_array / 2];
816 } else {
817 $return_value = ( $median_array[@median_array / 2] +
818 $median_array[(@median_array - 2) / 2] ) / 2;
821 $self -> {'median'}[$column] = $return_value;
823 end median
825 # }}} median
827 # {{{ mean
829 start mean
831 # Returns mean value of a column
832 # If a individual contains more then 1 value (i.e. if an individual has different values in different samples
833 # a mean value of all individuals if calculate first, then the mean value of the column
834 # If hi_cutoff is defined the mean function will cut all value below the cutoff,
835 # and set their value to 0. It's used to calculate the HI-mean/LOW-mean of a column for e.g. Hockey-stick covariates
836 # If both hi_cutoff and low_cutoff are defined only the hi_cutoff will be used.
837 # See L</max>.
838 my $tmp_column = $self -> {'column_head_indices'}{$column_head};
839 $self -> synchronize;
840 my $first_id = $self -> {'individuals'}[0];
841 die "data -> mean: No individuals defined in data object based on ",
842 $self -> full_name,"\n" unless defined $first_id;
843 unless ( defined $column && defined( $first_id -> subject_data ->[0][$column-1] ) ) {
844 unless (defined($column_head) && defined($self -> {'column_head_indices'}{$column_head})) {
845 die "Error in data -> mean: unknown column: \"$column_head\" or invalid column number: \"$column\"\n";
846 } else {
847 $column = $self -> {'column_head_indices'}{$column_head};
851 ## Here the calculation starts
852 my $num_individuals = 0;
853 my $sum = 0;
855 foreach my $individual ( @{$self ->{'individuals'}} ) {
857 my $ifactors = $individual -> subject_data;
858 my $individual_sum = 0;
860 for(my $i=0; $i<=$#{$ifactors}; $i++ ) {
861 next if ( $ifactors -> [$i] -> [$column-1] == $self -> {'missing_data_token'} );
862 if (defined $hi_cutoff) {
863 if ($ifactors->[$i]->[$column-1]>$hi_cutoff) {
864 $individual_sum += $ifactors -> [$i] -> [$column-1]-$hi_cutoff;
867 else {
868 if (defined $low_cutoff) {
869 if ($ifactors->[$i]->[$column-1]<$low_cutoff) {
870 $individual_sum += $low_cutoff - $ifactors -> [$i] -> [$column-1];
873 else {
874 $individual_sum += $ifactors -> [$i] -> [$column-1];
878 $sum += $individual_sum/($#{$ifactors}+1);
879 $num_individuals++;
881 $return_value = $sum / $num_individuals;
884 end mean
886 # }}} mean
888 # {{{ sd
890 start sd
892 # This sub returns standard deviation for a specific column
893 # If there are more than one sample/individual the value used for that specific
894 # individual is the mean value of its samples.
895 # The cut-offs are for hockey stick variables. I.e. If one individual value is
896 # lower than the hi-cutoff the individual value will be zero.
897 # HI_cutoff is used to calculate the HI-mean of a column.
898 # If cut_off is undef it won't be used
899 # See L</max>.
900 my $tmp_column = $self -> {'column_head_indices'}{$column_head};
901 $self -> synchronize;
902 my $first_id = $self -> {'individuals'}[0];
903 die "data -> sd: No individuals defined in data object based on ",
904 $self -> full_name,"\n" unless defined $first_id;
905 unless ( defined $column && defined( $first_id -> subject_data ->[0][$column-1] ) ) {
906 unless (defined($column_head) && defined($self -> {'column_head_indices'}{$column_head})) {
907 die "Error in data -> sd: unknown column: \"$column_head\" or invalid column number: \"$column\"\n";
908 } else {
909 $column = $self -> {'column_head_indices'}{$column_head};
913 ## Here the calculation starts
914 my $num_individuals = 0;
915 my $sum = 0;
916 my $mean;
917 if (defined $hi_cutoff) {
918 $mean = $self->mean(column => $column,
919 hi_cutoff => $hi_cutoff);
920 } elsif (defined $low_cutoff) {
921 $mean = $self->mean(column => $column,
922 low_cutoff => $low_cutoff);
923 } else {
924 $mean = $self->mean(column => $column);
927 foreach my $individual ( @{$self -> {'individuals'}} ) {
928 my $ifactors = $individual -> subject_data;
929 my $individual_sum = 0;
931 for(my $i=0; $i<=$#{$ifactors}; $i++ ) {
932 next if ( $ifactors -> [$i] -> [$column-1] == $self -> {'missing_data_token'} );
933 if (defined $hi_cutoff) {
934 if ($ifactors->[$i]->[$column-1]>$hi_cutoff) {
935 $individual_sum += $ifactors -> [$i] -> [$column-1]-$hi_cutoff;
937 } else {
938 if (defined $low_cutoff) {
939 if ($ifactors->[$i]->[$column-1]<$low_cutoff) {
940 $individual_sum += $low_cutoff - $ifactors -> [$i] -> [$column-1];
942 } else {
943 $individual_sum += $ifactors -> [$i] -> [$column-1];
947 $sum += ($individual_sum/($#{$ifactors}+1) - $mean) ** 2;
948 $num_individuals++;
950 $return_value = (1/($num_individuals-1)*$sum) ** 0.5;
953 end sd
955 # }}} sd
957 # {{{ range
958 start range
960 # See L</max>.
961 my $tmp_column = $self -> {'column_head_indices'}{$column_head};
962 if ( defined $self -> {'range'}[$tmp_column] ) {
963 $return_value = $self -> {'range'}[$tmp_column];
964 } else {
965 my $old_target = $self -> {'target'};
966 $self -> {'target'} = 'mem';
967 $self -> synchronize;
968 $return_value = $self -> max( column => $column,
969 column_head => $column_head ) -
970 $self -> min( column => $column,
971 column_head => $column_head );
972 $self -> {'range'}[$column] = $return_value;
973 if ( $old_target eq 'disk' ) {
974 $self -> flush if ( $self -> {'target'} eq 'disk' );
975 $self -> {'target'} = 'disk';
979 end range
980 # }}} range
982 # {{{ recalc_column
983 start recalc_column
985 # Recalculates a column based on expression. Also, see L</max>.
986 $self -> synchronize;
988 # Check if $column(-index) is defined and valid, else try to find index using column_head
989 my $first_id = $self -> {'individuals'}[0];
990 die "data -> recalc_column: No individuals defined in data object based on ",
991 $self -> full_name,"\n" unless defined $first_id;
992 unless ( defined $column && defined( $first_id -> subject_data ->[0][$column-1] ) ) {
993 if(defined($column_head) && defined($self -> {'column_head_indices'}{$column_head})){
994 die "Error in data -> recalc_column: unknown column: \"$column_head\" or column number: \"$column\"\n";
995 } else {
996 $column = $self -> {'column_head_indices'}{$column_head};
1000 for my $individual ( @{$self -> {'individuals'}} ) {
1001 $individual -> recalc_column( column => $column,
1002 expression => $expression );
1005 end recalc_column
1006 # }}} recalc_column
1008 # {{{ renumber_ascending
1010 start renumber_ascending
1012 # Renumbers the individuals (changes the subject identifiers) so that
1013 # all have unique integer numbers starting with start_at and
1014 # ascending. The primary use of this
1015 # method is not to order the individuals after their identifiers but to
1016 # ensure that all individuals have unique identifiers.
1018 $self -> synchronize;
1019 foreach my $individual ( @{$self -> {'individuals'}} ) {
1020 $individual -> idnumber ( $start_at++ );
1022 $self -> {'synced'} = 0;
1024 end renumber_ascending
1026 # }}} renumber_ascending
1028 # {{{ renumber_descending
1030 start renumber_descending
1032 # See L</renumber_ascending>.
1033 $self -> synchronize;
1034 foreach my $individual ( @{$self -> {'individuals'}} ) {
1035 $individual -> idnumber ( $start_at-- );
1037 $self -> {'synced'} = 0;
1039 end renumber_descending
1041 # }}} renumber_descending
1043 # {{{ single_valued_data
1045 start single_valued_data
1047 # Usage:
1049 # ($single_value_data_set, $remainder, $column_indexes) =
1050 # $data_object -> single_valued_data( subset_name => 'subset.dta',
1051 # remainder_name => 'remainder.dta',
1052 # target => 'disk',
1053 # do_not_test_columns => [1..18,24,26];
1055 # my $single_value_column_indexes = $column_indexes -> [0];
1056 # my $all_other_column_indexes = $column_indexes -> [1];
1058 # Analyses the content of each column, based on the
1059 # ID column, and returns two new data objects: One
1060 # that contains all columns that is has only one value per
1061 # individual and one that contains the
1062 # remainding data. This is useful for creating compact 'extra'
1063 # data sets that can be read in via user-defined sub-routines
1064 # when the number of columns needed exceeds the maximum that
1065 # NONMEM allows (e.g. 20 in NONMEM version V).
1067 # The I<do_not_test_columns> argument specifies on which columns
1068 # to skip the single value test
1070 my @multi_value_flags;
1071 my @individuals = @{$self -> {'individuals'}};
1072 # Initiate the flags:
1073 if ( defined $individuals[0] ) {
1074 my @data = @{$individuals[0] -> {'subject_data'}};
1075 for ( my $i = 0; $i < scalar @{$data[0]}; $i++ ) {
1076 my $dnt_flag = 0;
1077 foreach my $dntc ( @do_not_test_columns ) {
1078 $dnt_flag = 1 if ( $i == $dntc - 1 );
1080 $multi_value_flags[$i] = $dnt_flag;
1082 } else {
1083 die "data -> single_valued_data: No data in ID number 1\n";
1085 # Collect the stats
1086 for ( my $id = 0; $id <= $#individuals; $id++ ) {
1087 my @data = @{$individuals[$id] -> {'subject_data'}};
1088 for ( my $j = 0; $j < scalar @{$data[0]}; $j++ ) {
1089 my %col_unique;
1090 for ( my $i = 0; $i <= $#data; $i++ ) {
1091 $col_unique{$data[$i][$j]}++;
1093 my $factors = scalar keys %col_unique;
1094 $multi_value_flags[$j]++ if ( $factors > 1 );
1097 for ( my $i = 0; $i <= $#multi_value_flags; $i++ ) {
1098 if ( $multi_value_flags[$i] ) {
1099 push ( @{$column_indexes[1]}, $i + 1);
1100 } else {
1101 push ( @{$column_indexes[0]}, $i + 1);
1104 ( $single_value_data_set, $remainder ) =
1105 $self -> subset_vertically( column_indexes => $column_indexes[0],
1106 subset_name => $subset_name,
1107 return_remainder => 1,
1108 remainder_name => $remainder_name,
1109 target => $target,
1110 keep_first_row_only => 1);
1112 end single_valued_data
1114 # }}}
1116 # {{{ subset_vertically
1118 start subset_vertically
1120 # Usage:
1122 # $subset = $data_object -> subset_vertically ( column_indexes => [1,2,6],
1123 # subset_name => 'subset.dta' );
1125 # This basic usage returns a new data object containing
1126 # columns 1,2 and 6 from the original data plus the
1127 # idcolumn. The new data object will be associated with the
1128 # file 'subset.dta'.
1130 # You get the remaining data, i.e. the original data minus
1131 # the created subset by specifying
1133 # ( $subset, $remainder ) =
1134 # $data_object -> subset_vertically ( column_indexes => [1,2,6],
1135 # subset_name => 'subset.dta',
1136 # return_remainder => 1,
1137 # remainder_name => 'remainder.dta' );
1139 # If you would like to flush the created data sets to disk and
1140 # save memory, set the I<target> argument to 'disk'. The
1141 # default value 'mem' will keep the whole data object in RAM.
1143 # The I<keep_first_row_only> argument can be used to reduce
1144 # the size of the subset data obejct by excluding all but the
1145 # first row of data from each individual.
1147 my @individuals = @{$self -> {'individuals'}};
1148 # Create remainder index array if necessary
1149 my @remainder_indexes;
1150 if ( defined $individuals[0] ) {
1151 my @data = @{$individuals[0] -> {'subject_data'}};
1152 my $idcolumn = $individuals[0] -> {'idcolumn'};
1153 # print "IC: $idcolumn\n";
1154 my $id_flag = 0;
1155 foreach my $use_index ( @column_indexes ) {
1156 $id_flag = 1 if ( $use_index == $idcolumn );
1158 if ( $return_remainder ) {
1159 # @remainder_indexes = ( $idcolumn );
1160 for ( my $i = 0; $i < scalar @{$data[0]}; $i++ ) {
1161 my $rem_flag = 1;
1162 foreach my $use_index ( @column_indexes ) {
1163 $rem_flag = 0 if ( $i == $use_index -1 );
1164 # or
1165 # $i == $idcolumn -1 );
1167 push( @remainder_indexes, $i + 1 ) if ( $rem_flag );
1169 unshift( @remainder_indexes, $idcolumn ) if ( $id_flag );
1171 unshift( @column_indexes, $idcolumn ) unless ( $id_flag );
1172 } else {
1173 die "data -> single_valued_data: No data in ID number 1\n";
1176 # print "SS: @column_indexes\n";
1177 # print "R : @remainder_indexes\n";
1179 my @new_ids;
1180 my @new_ids_2;
1181 for ( my $id = 0; $id <= $#individuals; $id++ ) {
1182 my $idnumber = $individuals[$id] -> idnumber;
1183 my $idcolumn = $individuals[$id] -> idcolumn;
1184 my @data = @{$individuals[$id] -> {'subject_data'}};
1185 my @new_data;
1186 my @new_data_2;
1187 my $use_rows = $keep_first_row_only ? 0 : $#data;
1188 for ( my $i = 0; $i <= $use_rows; $i++ ) {
1189 my @new_row;
1190 foreach my $use_index ( @column_indexes ) {
1191 push( @new_row, $data[$i][$use_index-1] );
1193 # print "@new_row $#new_row\n";
1194 push( @new_data, \@new_row );
1196 for ( my $i = 0; $i <= $#data; $i++ ) {
1197 if ( $return_remainder ) {
1198 my @new_row_2;
1199 foreach my $use_index ( @remainder_indexes ) {
1200 push( @new_row_2, $data[$i][$use_index-1] );
1202 # print "@new_row_2 $#new_row_2\n";
1203 push( @new_data_2, \@new_row_2 );
1206 my $new_id = data::individual -> new( idnumber => $idnumber,
1207 idcolumn => $idcolumn,
1208 subject_data => \@new_data );
1209 push( @new_ids, $new_id );
1210 if ( $return_remainder ) {
1211 my $new_id_2;
1212 $new_id_2 = data::individual -> new( idnumber => $idnumber,
1213 idcolumn => $idcolumn,
1214 subject_data => \@new_data_2 );
1215 push( @new_ids_2, $new_id_2 );
1218 my @header = @{$self -> {'header'}};
1219 my @new_header;
1220 foreach my $use_index ( @column_indexes ) {
1221 push( @new_header, @header[$use_index-1] );
1223 my @comment = @{$self -> {'comment'}};
1224 my $comment = \@comment;
1225 $subset = data -> new ( filename => $subset_name,
1226 directory => $self -> {'directory'},
1227 ignoresign => $self -> {'ignoresign'},
1228 header => \@new_header,
1229 comment => $comment,
1230 individuals => \@new_ids,
1231 target => $target,
1232 ignore_missing_files => 1 );
1233 if ( $return_remainder ) {
1234 my @new_header_2;
1235 foreach my $use_index ( @remainder_indexes ) {
1236 push( @new_header_2, @header[$use_index-1] );
1238 $remainder = data -> new ( filename => $remainder_name,
1239 directory => $self -> {'directory'},
1240 ignoresign => $self -> {'ignoresign'},
1241 header => \@new_header_2,
1242 comment => $comment,
1243 individuals => \@new_ids_2,
1244 target => $target,
1245 ignore_missing_files => 1 );
1248 end subset_vertically
1250 # }}}
1252 # {{{ subsets
1254 start subsets
1256 # if ( defined $expression and defined $bins ) {
1257 # die "data -> subset: expression and bins may not both be specified\n";
1259 # if ( not ( defined $expression or defined $bins ) ) {
1260 # die "data -> subset: expression or bins must be specified\n";
1262 $self -> synchronize;
1263 my @header = @{$self -> {'header'}};
1264 my @comment = @{$self -> {'comment'}};
1265 my @subset_ids= ();
1266 my %rnd_ids;
1267 my $key = 0;
1268 my @ids = @{$self -> {'individuals'}};
1269 if ( defined $stratify_on ) {
1270 my $work_data = $self -> copy( filename => 'work_data.dta',
1271 target => 'mem' );
1272 my %strata = %{$work_data -> factors( column => $stratify_on )};
1274 while ( my ( $factor, $keys ) = each %strata ) {
1275 foreach my $key ( @{$keys} ) {
1276 my $rnd_num = rand;
1277 while ( defined $rnd_ids{$factor}{$rnd_num} ) {
1278 $rnd_num = rand;
1280 $rnd_ids{$factor}{$rnd_num} = $ids[$key];
1283 my $first = 1;
1284 while ( my ( $factor, $rnd_nums ) = each %rnd_ids ) {
1285 my @sort_rnd_nums = sort { $a <=> $b } keys %{$rnd_nums};
1286 for ( my $i = 0; $i <= $#sort_rnd_nums; $i ) {
1287 for ( my $j = 0; $j < $bins; $j++ ) {
1288 if ( $first ) {
1289 push( @subset_ids, [$rnd_ids{$factor}{$sort_rnd_nums[$i]} -> copy] );
1290 push( @incl_ids, [$rnd_ids{$factor}{$sort_rnd_nums[$i]} -> idnumber] );
1291 } else {
1292 push( @{$subset_ids[$j]}, $rnd_ids{$factor}{$sort_rnd_nums[$i]} -> copy );
1293 push( @{$incl_ids[$j]}, $rnd_ids{$factor}{$sort_rnd_nums[$i]} -> idnumber );
1295 $i++;
1296 last if $i > $#sort_rnd_nums;
1298 $first = 0;
1301 for ( my $j = 0; $j < $bins; $j++ ) {
1302 my $sdata = data -> new ( header => \@header,
1303 comment => \@comment,
1304 ignoresign => $self -> {'ignoresign'},
1305 individuals => $subset_ids[$j],
1306 ignore_missing_files => 1,
1307 target => $target,
1308 idcolumn => $self -> {'idcolumn'},
1309 filename => "subset_$j.dta" );
1310 #$sdata -> _write;
1311 push( @subsets, $sdata );
1313 } else {
1314 for ( my $i = 0; $i <= $#ids; $i++ ) {
1315 my $rnd_num = rand;
1316 while ( defined $rnd_ids{$rnd_num} ) {
1317 $rnd_num = rand;
1319 $rnd_ids{$rnd_num} = $ids[$i];
1321 my @keys = sort { $a <=> $b } keys %rnd_ids;
1322 my $first = 1;
1323 for ( my $i = 0; $i <= $#keys; $i ) {
1324 for ( my $j = 0; $j < $bins; $j++ ) {
1325 if ( $first ) {
1326 push( @subset_ids, [$rnd_ids{$keys[$i]} -> copy] );
1327 push( @incl_ids, [$rnd_ids{$keys[$i]} -> idnumber] );
1328 } else {
1329 push( @{$subset_ids[$j]}, $rnd_ids{$keys[$i]} -> copy );
1330 push( @{$incl_ids[$j]}, $rnd_ids{$keys[$i]} -> idnumber );
1332 $i++;
1333 last if $i > $#keys;
1335 $first = 0;
1337 for ( my $j = 0; $j < $bins; $j++ ) {
1338 my $sdata = data -> new ( header => \@header,
1339 comment => \@comment,
1340 ignoresign => $self -> {'ignoresign'},
1341 individuals => $subset_ids[$j],
1342 ignore_missing_files => 1,
1343 target => $target,
1344 idcolumn => $self -> {'idcolumn'},
1345 filename => "subset_$j.dta" );
1346 #$sdata -> _write;
1347 push( @subsets, $sdata );
1351 end subsets
1353 # }}} subsets
1355 # {{{ subset
1357 start subset
1359 $self -> synchronize;
1360 my @header = @{$self -> {'header'}};
1361 my @comment = @{$self -> {'comment'}};
1362 my @subset_inds = ();
1363 my $key = 0;
1364 foreach my $individual ( @{$self -> {'individuals'}} ) {
1365 if ( $individual -> evaluate_expression( column => $based_on,
1366 expression => $expression ) ) {
1367 push( @subset_inds, $individual -> copy );
1368 push( @incl_individuals, $individual -> idnumber );
1369 push( @included_keys, $key );
1371 $key++;
1373 $subset = data -> new ( header => \@header,
1374 comment => \@comment,
1375 ignoresign => $self -> {'ignoresign'},
1376 individuals => \@subset_inds,
1377 idcolumn => $self -> {'idcolumn'},
1378 filename => "subset.dta" );
1380 end subset
1382 # }}} subset
1384 # {{{ target
1385 start target
1387 if ( $parm eq 'disk' and $self -> {'target'} eq 'mem' ) {
1388 $self -> {'target'} = 'disk';
1389 $self -> flush;
1390 } elsif ( $parm eq 'mem' and $self -> {'target'} eq 'disk' ) {
1391 $self -> {'target'} = 'mem';
1392 $self -> synchronize;
1395 end target
1396 # }}}
1398 # {{{ _write
1400 start _write
1402 die "ERROR: data -> _write: No filename set in data object.\n"
1403 if( $filename eq '' );
1405 if( not defined $self -> {'individuals'} ){
1406 unless( $filename eq $self -> full_name ){
1407 $self -> synchronize;
1411 open(FILE,">$filename") ||
1412 die "Could not create $filename\n";
1413 if ( defined $self -> comment ) {
1414 my @comment = @{$self -> comment};
1415 for ( @comment ) { print ( FILE ); }
1417 if ((defined $self -> header) and (defined $self -> ignoresign)) {
1418 print FILE $self -> ignoresign,join(',',@{$self -> header}),"\n";
1420 my @data = @{$self -> format_data};
1421 for ( @data ) {
1422 print ( FILE );
1424 close(FILE);
1426 if ( $PsN::config -> {'_'} -> {'use_database'} ) {
1427 # Backslashes messes up the sql syntax
1428 my $file_str = $self->{'filename'};
1429 my $dir_str = $self->{'directory'};
1430 $file_str =~ s/\\/\//g;
1431 $dir_str =~ s/\\/\//g;
1433 # md5sum
1434 my $md5sum = md5_hex(OSspecific::slurp_file($self-> full_name ));
1435 my ( $date_cmd, $time_cmd );
1436 if ( $Config{osname} eq 'MSWin32' ) {
1437 $date_cmd = 'date /T';
1438 $time_cmd = 'time /T';
1439 } else {
1440 # Assuming UNIX
1441 $date_cmd = 'date';
1442 $time_cmd = 'time';
1445 my $date_str = `$date_cmd`;
1446 my $time_str = `$time_cmd`;
1447 chomp($date_str);
1448 chomp($time_str);
1449 my $date_time = "$date_str $time_str";
1451 my $dbh = DBI -> connect("DBI:mysql:host=localhost;databse=psn",
1452 "psn", "psn_test",
1454 'RaiseError' => 1});
1455 my $sth;
1456 if ( defined $self -> {'data_id'} ) {
1457 $sth = $dbh -> prepare( "UPDATE psn.data ".
1458 "SET filename='$file_str',date='$date_time',".
1459 "directory='$dir_str',md5sum='$md5sum' ".
1460 "WHERE data_id='".$self -> {'data_id'}."'" );
1461 $sth -> execute or debug -> die( message => $sth->errstr ) ;
1462 } else {
1463 $sth = $dbh -> prepare("INSERT INTO psn.data (filename,date,directory,md5sum) ".
1464 "VALUES ('$file_str', '$date_time', '$dir_str','".
1465 $md5sum."' )");
1466 $sth -> execute;
1467 $self -> {'data_id'} = $sth->{'mysql_insertid'};
1469 $sth -> finish;
1470 $dbh -> disconnect;
1473 end _write
1475 # }}} _write
1477 # {{{ flush
1478 start flush
1480 # synchronizes the object with the file on disk and empties
1481 # most of the objects attributes to save memory.
1482 if( defined $self -> {'individuals'} and
1483 ( !$self -> {'synced'} or $force ) ) {
1484 $self -> _write;
1486 $self -> {'header'} = undef;
1487 $self -> {'comment'} = undef;
1488 $self -> {'individuals'} = undef;
1489 $self -> {'synced'} = 0;
1491 end flush
1492 # }}} flush
1494 # {{{ synchronize
1496 start synchronize
1498 # synchronizes the object with the file on disk
1499 unless( $self -> {'synced'} ){
1500 if( defined $self -> {'individuals'} and
1501 scalar @{$self -> {'individuals'}} > 0 ){
1502 # We should not read new data from file if we
1503 # have an individuals defined?
1504 # Perhaps there should be an attribute
1505 # 'from_file' that overrides this and reads in
1506 # the data from the file specified in filename
1507 # and overwrites whatever the object already
1508 # contains?
1509 # if( -e $self -> {'filename'} ){
1510 # $self -> _read_header;
1511 # $self -> _read_individuals;
1513 $self -> _write;
1514 } else {
1515 if( -e $self -> full_name ){
1516 $self -> _read_header;
1517 $self -> _read_individuals;
1518 } else {
1519 debug -> die( message => "Fatal error: datafile: " . $self -> full_name . " does not exist." );
1520 return;
1524 my $i = 1;
1525 foreach my $head ( @{$self -> {'header'}} ){
1526 $self -> {'column_head_indices'} -> {$head} = $i;
1527 $i++;
1529 $self -> {'synced'} = 1;
1531 end synchronize
1533 # }}} synchronize
1535 # {{{ _fisher_yates_shuffle
1537 start _fisher_yates_shuffle
1539 my $arr_ref = $parm{'array'};
1540 debug -> warn( level => 1,
1541 message => "Array of zero length received" )
1542 if ( scalar @{$arr_ref} < 1 );
1543 my $i;
1544 for ($i = @$arr_ref; --$i; ) {
1545 my $j = random_uniform_integer(1,0,$i);
1546 # my $j = int rand ($i+1);
1547 # print "$j $j_new\n";
1548 @$arr_ref[$i,$j] = @$arr_ref[$j,$i];
1551 end _fisher_yates_shuffle
1553 # }}} _fisher_yates_shuffle
1555 # {{{ _read_header
1557 start _read_header
1559 my $filename = $self -> full_name;
1560 my $ignoresign = $self -> ignoresign;
1561 my ( @data, @new_record, $row, $tmp_row, @header, $hdrstring );
1562 open(DATAFILE,"$filename") ||
1563 die "Could not open $filename for reading";
1564 my $columns;
1565 while (<DATAFILE>) {
1566 $tmp_row = $_;
1567 # @new_record = split(/\,|\s+/,$_);
1568 if ( ! (/^\s*\d+|^\s*\./) ) {
1569 $data[$row] = $tmp_row;
1570 $row++;
1571 } else {
1572 # We have reached the first data-row, return.
1573 $columns = scalar split(/\,\s*|\s+/);
1574 last;
1577 close(DATAFILE);
1578 chomp( $hdrstring = pop(@data));
1579 @header = split(/\,\s*|\s+/,$hdrstring);
1580 $header[0] =~ s/$ignoresign//
1581 if ( defined $self->ignoresign );
1582 shift( @header ) if ( $header[0] eq "" );
1583 $self -> {'header'} = \@header;
1584 $self -> {'comment'} = \@data;
1585 if ( $PsN::config -> {'_'} -> {'use_database'} ) {
1586 my $dbh = DBI -> connect("DBI:mysql:host=localhost;databse=psn",
1587 "psn", "psn_test",
1588 {'RaiseError' => 1});
1589 if ( scalar @header < 1 ) {
1590 for ( my $i = 1; $i <= $columns; $i++ ) {
1591 push( @header, $i );
1594 for ( my $i = 0; $i <= $#header; $i++ ) {
1595 my $sth = $dbh -> prepare("INSERT INTO psn.data_column ".
1596 "(name,number,data_id) ".
1597 "VALUES ('".$header[$i]."', '".($i+1).
1598 "', '".$self -> {'data_id'}."' )");
1599 $sth -> execute;
1600 push( @{$self -> {'data_column_ids'}}, $sth->{'mysql_insertid'} );
1601 $sth -> finish;
1603 $dbh -> disconnect;
1606 end _read_header
1608 # }}} _read_header
1610 # {{{ _read_individuals
1612 start _read_individuals
1614 my $idcol = $self -> idcolumn;
1615 my $filename = $self -> full_name;
1616 debug -> warn( level => 1,
1617 message => "Building array of individuals from file " . $self -> {'filename'} );
1618 open(DATAFILE,"$filename") ||
1619 die "Could not open $filename for reading";
1620 my ( @new_row, $new_ID, $old_ID, @init_data );
1621 my $buffer;
1622 my $lines = 0;
1623 while (sysread DATAFILE, $buffer, 4096) {
1624 $lines += ($buffer =~ tr/\n//);
1626 seek( DATAFILE, 0,0 );
1628 # For status bar:
1629 my $steps = 51;
1630 my $step = $lines/$steps;
1631 my $counter = 1;
1632 my $step_counter = 1;
1634 ui -> print( category => 'scm',
1635 message => "Reading data file: ".$self -> filename );
1637 my ( $sth, $dbh, $first_row_id, $first_value_id );
1638 my $insert = 1;
1639 if ( $PsN::config -> {'_'} -> {'use_database'} ) {
1640 $dbh = DBI -> connect("DBI:mysql:host=localhost;databse=psn",
1641 "psn", "psn_test",
1642 {'RaiseError' => 1});
1643 my $sth = $dbh -> prepare( "SELECT data_row_id FROM psn.data_row ".
1644 "WHERE data_id='".$self -> {'data_id'}."'" );
1645 $sth -> execute or debug -> die( message => $sth->errstr ) ;
1646 my $select_arr = $sth -> fetchall_arrayref;
1647 if ( scalar @{$select_arr} > 0 ) {
1648 for ( my $i = 0; $i < scalar @{$select_arr}; $i++ ) {
1649 push( @{$self -> {'data_row_ids'}}, $select_arr->[$i][0] );
1651 $sth = $dbh -> prepare( "SELECT data_value_id FROM psn.data_value ".
1652 "WHERE data_id='".$self -> {'data_id'}."'" );
1653 $sth -> execute or debug -> die( message => $sth->errstr ) ;
1654 my $select_val = $sth -> fetchall_arrayref;
1655 for ( my $i = 0; $i < scalar @{$select_val}; $i++ ) {
1656 push( @{$self -> {'data_value_ids'}}, $select_val->[$i][0] );
1658 $insert = 0;
1659 $dbh -> disconnect;
1660 } else {
1661 $dbh -> do( "LOCK TABLES psn.data_row WRITE, psn.data_value WRITE" );
1662 $sth = $dbh -> prepare( "SELECT MAX(data_row_id) FROM psn.data_row" );
1663 $sth -> execute or debug -> die( message => $sth->errstr ) ;
1664 my $select_arr = $sth -> fetchall_arrayref;
1665 $first_row_id = defined $select_arr -> [0][0] ? $select_arr -> [0][0] : 0;
1666 $sth = $dbh -> prepare( "SELECT MAX(data_value_id) FROM psn.data_value" );
1667 $sth -> execute or debug -> die( message => $sth->errstr ) ;
1668 my $select_arr = $sth -> fetchall_arrayref;
1669 $first_value_id = defined $select_arr -> [0][0] ? $select_arr -> [0][0] : 0;
1671 $sth -> finish;
1674 my $insert_rows;
1675 my $insert_values;
1676 my $row_counter = 0;
1677 while ( <DATAFILE> ) {
1678 s/^ *//;
1679 my @new_row = split(/\,\s*|\s+/);
1680 # This regexp check is not time consuming.
1681 if ( /^\s*\d+|^\s*\./ ) {
1682 $new_ID = $new_row[$idcol-1]; # index starts at 0
1683 $old_ID = $new_ID if ( not defined $old_ID );
1685 # Check if column miss data at some row (This adds about 30% of init time)
1686 my $mdt = $self -> {'missing_data_token'};
1687 for( my $i = 0; $i <= $#new_row; $i++ ){
1688 $self -> {'have_missing_data'} -> {$i+1} = 1
1689 if( $new_row[$i] == $mdt ); # == is slower but safer than eq
1691 if ( $PsN::config -> {'_'} -> {'use_database'} and $insert ) {
1692 $row_counter++;
1693 $insert_rows = $insert_rows."," if ( defined $insert_rows );
1694 $insert_rows = $insert_rows.
1695 "('$row_counter', '".$self -> {'data_id'}."' )";
1696 for ( my $j = 0; $j <= $#new_row; $j++ ) {
1697 $insert_values = $insert_values."," if ( defined $insert_values );
1698 $insert_values = $insert_values.
1699 "('".$new_row[$j]."', '".
1700 ($first_row_id+$row_counter)."', '".
1701 $self -> {'data_column_ids'}->[$j].
1702 "', '".$self -> {'data_id'}."' )";
1706 if ( $new_ID != $old_ID ) {
1707 my @subject_data = @init_data;
1708 push( @{$self -> {'individuals'}},
1709 data::individual -> new ( idcolumn => $idcol,
1710 subject_data => \@subject_data,
1711 data_id => $self -> {'data_id'} ) );
1712 @init_data = (\@new_row);
1713 } else {
1714 push( @init_data, \@new_row );
1716 $old_ID = $new_ID;
1718 if ( $step > 1 ) {
1719 if ( not ( $counter % $step ) ) {
1720 my $nl = $step_counter == $steps ? "" : "\r";
1721 ui -> print( category => 'scm',
1722 message => ui -> status_bar( sofar => $counter,
1723 goal => $lines,
1724 width => $steps-1 ).$nl,
1725 wrap => 0,
1726 newline => 0 );
1727 $step_counter++;
1730 $counter++;
1732 if ( $PsN::config -> {'_'} -> {'use_database'} and $insert ) {
1733 $dbh -> do("INSERT INTO psn.data_row ".
1734 "(number,data_id) ".
1735 "VALUES ".$insert_rows);
1736 push( @{$self -> {'data_row_ids'}}, ($first_row_id..$first_row_id+$row_counter) );
1737 $dbh -> do( "INSERT INTO psn.data_value ".
1738 "(value,data_row_id,data_column_id,data_id) ".
1739 "VALUES ".$insert_values );
1740 push( @{$self -> {'data_value_ids'}},
1741 ($first_value_id..$first_value_id+($row_counter*
1742 scalar @{$self->{'data_column_ids'}})));
1743 $dbh -> do( "UNLOCK TABLES" );
1744 $dbh -> disconnect;
1747 if ( $#init_data >= 0 ) {
1748 push( @{$self -> {'individuals'}},
1749 data::individual -> new ( idcolumn => $idcol,
1750 subject_data => \@init_data ) );
1752 ui -> print( category => 'scm',
1753 message => " ... done" );
1754 close(DATAFILE);
1755 # $self -> _write( filename => 'test.dta' );
1757 end _read_individuals
1759 # }}} _read_individuals