Merge pull request #5191 from solgenomics/topic/quality_control
[sgn.git] / lib / CXGN / Phenotypes / File.pm
bloba5aa234c84df440062609a708f2ac275f2108fc4
2 =head1 NAME
4 CXGN::Phenotypes::File - a class to parse out information from the files that phenotype searching returns
6 =head1 DESCRIPTION
8 =head1 AUTHOR
10 Lukas Mueller <lam87@cornell.edu>
12 =cut
14 package CXGN::Phenotypes::File;
16 use Moose;
17 use Data::Dumper;
18 use File::Slurp qw | slurp |;
# Path of the tab-separated phenotype file; it is read by BUILD at
# construction time.
has file => (
    is  => 'rw',
    isa => 'Str',
);

# Header names of the leading (factor) columns, filled in by BUILD.
has factors => (
    is  => 'rw',
    isa => 'ArrayRef',
);

# Header names of the columns that follow the factor columns, filled in
# by BUILD.
has traits => (
    is  => 'rw',
    isa => 'ArrayRef',
);

# Per-factor statistics, filled in by BUILD:
#   { column_name => { fields => { value => count, ... }, distinct => N } }
has levels => (
    is  => 'rw',
    isa => 'HashRef',
);

# When true (the default), one pair of enclosing double quotes is stripped
# from header names and from field values.
has remove_quotes => (
    is      => 'rw',
    isa     => 'Bool',
    default => 1,
);

# Zero-based index of the last factor column: header columns
# 0 .. $FACTOR_COUNT are treated as factors, everything after as traits.
our $FACTOR_COUNT = 38;
# BUILD - Moose construction hook.
#
# Reads the tab-separated file named by the 'file' attribute and populates:
#   factors - header names of columns 0 .. $FACTOR_COUNT (clamped to the
#             columns actually present in the header)
#   traits  - header names of all columns after $FACTOR_COUNT
#   levels  - for every factor column that has data:
#               { fields   => { value => number_of_rows_with_value },
#                 distinct => number_of_different_values }
#
# If 'remove_quotes' is true, one pair of enclosing double quotes is removed
# from each header name and each factor value.
#
# Dies with whatever error slurp() raises if the file cannot be read.
sub BUILD {
    my $self = shift;

    my @lines        = slurp($self->file());
    my $strip_quotes = $self->remove_quotes();

    # An empty file yields no header; treat it as "no columns" instead of
    # operating on undef.
    my $header = @lines ? shift @lines : '';
    $header =~ s/\r?\n\z//;

    my @keys = split /\t/, $header;
    if ($strip_quotes) {
        foreach my $key (@keys) {
            $key =~ s/^\"(.*)\"$/$1/;
        }
    }

    # Index of the last factor column that really exists in this file.
    my $last_factor = $#keys < $FACTOR_COUNT ? $#keys : $FACTOR_COUNT;

    my %levels = ();

    foreach my $row (@lines) {
        # Strip the line terminator so it does not end up in the last field.
        $row =~ s/\r?\n\z//;
        next if $row eq '';    # blank line, nothing to count

        my @fields = split /\t/, $row;

        # Only factor columns contribute to the level statistics, and short
        # rows must not be read (or autovivified) past their last field.
        my $last_col = $#fields < $last_factor ? $#fields : $last_factor;

        foreach my $n (0 .. $last_col) {
            my $value = $fields[$n];
            $value =~ s/^\"(.*)\"$/$1/ if $strip_quotes;
            $levels{ $keys[$n] }->{fields}->{$value}++;
        }
    }

    # The distinct count only depends on the final tallies, so compute it
    # once per factor rather than once per cell.
    foreach my $factor_levels (values %levels) {
        $factor_levels->{distinct} = scalar keys %{ $factor_levels->{fields} };
    }

    $self->factors( [ @keys[ 0 .. $last_factor ] ] );

    # $#keys (not scalar(@keys)) is the last valid index; using the element
    # count would append a spurious undef trait.
    $self->traits( [ @keys[ $FACTOR_COUNT + 1 .. $#keys ] ] );

    $self->levels(\%levels);
}
# distinct_levels_for_factor($factor)
#
# Returns the number of distinct values recorded for the column named
# $factor in the 'levels' attribute, or undef when nothing is recorded
# for that column.
#
# The lookup is done in two steps so that asking about an unknown factor
# does not autovivify an empty entry in the levels hash.
sub distinct_levels_for_factor {
    my $self   = shift;
    my $factor = shift;

    my $levels = $self->levels();
    if (!$levels || !exists $levels->{$factor}) {
        return undef;    ## no critic (ProhibitExplicitReturnUndef) - keeps the one-value return of the chained lookup
    }

    return $levels->{$factor}->{distinct};
}