remove code duplication in graft validation.
[sgn.git] / lib / SGN / Genefamily.pm
blob1588ffca1a7ed8906b42a1b622f1e896e7f49142
2 =head1 NAME
4 SGN::Genefamily - a class to deal with (currently disk-based) genefamilies for tomato annotation purposes
6 =head1 DESCRIPTION
8 The genefamilies are defined by alignment files in a subdirectory. Thus it is easy to update the family definitions, which will happen frequently over the next two months. Then the gene families will be moved to the database. So this code is only very temporary.
10 =head1 AUTHOR
12 Lukas Mueller <lam87@cornell.edu>
14 =head1 METHODS
16 Methods in this class include:
18 =cut
20 package SGN::Genefamily;
22 use Moose;
23 use namespace::autoclean;
24 use File::Slurp qw/slurp/;
25 use File::Spec::Functions;
26 use File::Basename qw/basename/;
28 =head2 accessors name()
30 Usage: $gf->name()
31 Property: the name of the gene family
32 Side Effects: will be used to map to the corresponding file name
33 Example:
35 =cut
# Read-write attribute holding the gene family name. The name is combined
# with fixed suffixes elsewhere in this class to locate the family's files.
has name => (
    is => 'rw',

    # required => 1,
);
42 =head2 members
44 Usage: my @members = $gf->members()
45 Desc: retrieves the members of a genefamily. Read only.
46 Property: the members of the gene family
47 Side Effects:
48 Example:
50 =cut
# Read-only array reference of family members; each object gets its own
# fresh empty array by default.
has members => (
    is      => 'ro',
    isa     => 'ArrayRef',
    default => sub { return [] },
);
54 =head2 files_dir
56 Usage: my $dir = $gf->files_dir()
57 Desc: sets the directory where the genefamilies are located.
58 Property: a path
59 Side Effects: used for retrieving gene family information
60 Example:
62 =cut
# Read-write attribute, mandatory at construction time: the top-level
# directory under which the per-dataset subdirectories are located.
has files_dir => (
    is       => 'rw',
    required => 1,
);
69 =head2 dataset
71 Usage: my $d = $gf->dataset()
72 Desc: under the genefamily dir (files_dir), a number of sub-dirs
73 should be present, each of which represents a separate
74 gene family clustering (for example, based on different
75 species or different clustering parameters).
76 Property: the dataset name [string]
77 Side Effects:
78 Example:
80 =cut
# Read-write attribute, mandatory at construction time: the name of the
# dataset subdirectory (below files_dir) this object reads from.
has dataset => (
    is       => 'rw',
    required => 1,
);
87 =head2 get_alignment
89 Usage: my $alignment = $gf->get_alignment()
90 Desc: returns the alignment as a string
91 Args: none
92 Side Effects: dies if the alignment has not yet been calculated.
93 Example:
95 =cut
# Return the alignment of this gene family as one string.
#
# The alignment is read from <get_path()>/alignments/<name>.fa.align.
# Dies with "No alignment file available for family <name>" when that
# file does not exist.
sub get_alignment {
    my $self = shift;

    my $file =
      catfile( $self->get_path(), "alignments", $self->name() . ".fa.align" );

    if ( !-e $file ) {
        die "No alignment file available for family " . $self->name();
    }

    # slurp() hands back a list of lines when called in list context.
    # Force scalar context so every caller receives a single string, as
    # documented, regardless of how this method itself is called.
    return scalar slurp($file);
}
109 =head2 get_fasta
111 Usage: my $fasta = $gf->get_fasta()
112 Desc: returns the sequences of a gene family as a string
113 formatted in fasta.
114 Ret: fasta
115 Args: none
116 Side Effects: dies if the fasta is not available.
117 Example:
119 =cut
# Return the sequences of this gene family as one fasta-formatted string.
#
# The data is read from <get_path()>/fasta/<name>.fa.
# Dies with "The fasta information for family <name> cannot be found"
# when that path is not a plain file.
sub get_fasta {
    my $self = shift;

    my $file = catfile( $self->get_path(), "fasta", $self->name() . ".fa" );

    unless ( -f $file ) {
        die "The fasta information for family "
          . $self->name()
          . " cannot be found";
    }

    # slurp() hands back a list of lines when called in list context.
    # Force scalar context so every caller receives a single string, as
    # documented, regardless of how this method itself is called.
    return scalar slurp($file);
}
132 =head2 get_seqs
134 Usage: my @seqs = $gf->get_seqs()
135 Desc: returns the sequences of a gene family as a list of
136 Bio::Seq objects
137 Ret: a list of Bio::Seq objects
138 Args: none
139 Side Effects: dies if the fasta information is not available.
140 Example:
142 =cut
# Return the sequences of this gene family as a list of sequence objects,
# one per record produced by Bio::SeqIO's fasta parser.
#
# The data is read from <get_path()>/fasta/<name>.fa.
# Dies with "The fasta information for family <name> cannot be found"
# when that file does not exist.
sub get_seqs {
    my $self = shift;

    my $file = catfile( $self->get_path(), "fasta", $self->name() . ".fa" );

    if ( !-e $file ) {
        die "The fasta information for family "
          . $self->name()
          . " cannot be found";
    }

    # No "use Bio::SeqIO" is visible among this module's imports, so load
    # the parser on demand instead of relying on some other code having
    # loaded it first.
    require Bio::SeqIO;

    my @seqs = ();
    my $io = Bio::SeqIO->new( -format => 'fasta', -file => $file );
    while ( my $seq = $io->next_seq() ) {
        push @seqs, $seq;
    }

    return @seqs;
}
160 =head2 get_tree
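162 Usage: my $tree = $gf->get_tree()
163 Desc: returns the tree of the gene family as a string
164 Ret: the contents of the family's tree file
165 Args: none
166 Side Effects: dies if the tree file cannot be found.
167 Example: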
169 =cut
# Return the tree of this gene family as one string.
#
# The tree is read from <get_path()>/trees/<name>.tree.
# Dies with "The tree information for family <name> cannot be found"
# when that file does not exist.
sub get_tree {
    my $self = shift;

    # Pass the path components separately, as the sibling methods do,
    # instead of embedding literal "/" separators in a catfile() argument.
    my $file = catfile( $self->get_path(), "trees", $self->name() . ".tree" );

    if ( !-e $file ) {
        die "The tree information for family "
          . $self->name()
          . " cannot be found";
    }

    # slurp() hands back a list of lines when called in list context.
    # Force scalar context so every caller receives a single string
    # regardless of how this method itself is called.
    return scalar slurp($file);
}
# NOTE(review): this method is an unimplemented stub. It performs no lookup
# of member ids; it only unpacks its invocant, which is also what it ends
# up returning. Confirm the intended behaviour before relying on it.
sub get_member_ids {
    my ($self) = @_;
    return $self;
}
188 =head2 get_available_datasets
190 Usage: my @ds = SGN::Genefamily->get_available_datasets($DIR)
191 Desc: a class function that returns the available datasets
192 Ret: a list of dataset names
193 Args: the $DIR where the datasets are located.
194 Side Effects:
195 Example:
197 =cut
# Class method: list the datasets available below a directory.
#
# Args: $path - the directory that holds one subdirectory per dataset.
# Ret:  the base names of all non-hidden subdirectories of $path, or an
#       empty list when $path is undefined, empty, or has no subdirectories.
sub get_available_datasets {
    my $class = shift;
    my $path  = shift;

    my @dirs = ();

    # Without a usable path the pattern would degrade to "/*" and list the
    # filesystem root, so report no datasets instead.
    if ( defined $path && length $path ) {

        # The core glob() splits its pattern on whitespace, which breaks
        # for directories with spaces in their name. bsd_glob() treats the
        # whole string as one pattern and otherwise uses the same defaults.
        # (Glob metacharacters inside $path itself are still interpreted.)
        require File::Glob;
        @dirs = map { basename($_) }
          grep { -d } File::Glob::bsd_glob("$path/*");
    }

    return @dirs;
}
# Return the directory of the current dataset: the dataset() name appended
# to files_dir().
sub get_path {
    my $self = shift;

    # The result names a directory, so build it with catdir() (exported by
    # File::Spec::Functions by default) rather than catfile().
    return catdir( $self->files_dir(), $self->dataset() );
}