Merge pull request #5205 from solgenomics/topic/generic_trial_upload
[sgn.git] / lib / SGN / Genefamily.pm
blob9dcb32675ecb9f609196f48f41555a2d07c4d01c
2 =head1 NAME
4 SGN::Genefamily - a class to deal with (currently disk-based) genefamilies for tomato annotation purposes
6 =head1 DESCRIPTION
8 The genefamilies are defined by alignment files in a subdirectory. Thus it is easy to update the family definitions, which will happen frequently over the next two months. Then the gene families will be moved to the database. So this code is only very temporary.
10 =head1 AUTHOR
12 Lukas Mueller <lam87@cornell.edu>
14 =head1 METHODS
16 Methods in this class include:
18 =cut
20 package SGN::Genefamily;
22 use Moose;
24 with 'MooseX::Object::Pluggable';
26 use namespace::autoclean;
27 use Data::Dumper;
28 use File::Slurp qw/slurp/;
29 use File::Spec qw | catfile |;
30 use File::Spec::Functions;
31 use File::Basename qw/basename/;
33 =head2 accessors genefamily_method()
35 =cut
37 has 'genefamily_format' => (
38 is => 'rw',
39 isa => 'Str',
42 has 'genefamily_defs_file' => (
43 is => 'rw',
44 isa => 'Str',
45 default => sub { return 'genefamily_defs.txt' },
48 has 'sequence_link' => (
49 is => 'rw',
50 isa => 'Str',
51 default => sub { return '/tools/genefamily/seq/'; } # add /$build/$family/$seq_id
55 =head2 accessors name()
57 Usage: $gf->name()
58 Property: the name of the gene family
59 Side Effects: will be used to map to the corresponding file name
60 Example:
62 =cut
64 has 'name' => (
65 is => 'rw',
66 # required => 1,
69 # =head2 members
71 # Usage: my @members = $gf->members()
72 # Desc: retrieves the members of a genefamily. Read only.
73 # Property: the members of the gene family
74 # Side Effects:
75 # Example:
77 # =cut
79 # has 'members' => ( is => 'ro', isa => 'ArrayRef', default => sub { [] } );
81 =head2 files_dir
83 Usage: my $dir = $gf->files_dir()
84 Desc: sets the directory where the genefamilies are located.
85 Property: a path
86 Side Effects: used for retrieving gene family information
87 Example:
89 =cut
91 has 'files_dir' => (
92 is => 'rw',
93 required => 1,
96 =head2 build
98 Usage: my $d = $gf->build()
99 Desc: under the genefamily dir (files_dir), a number of sub-dirs
100 should be present, each of which represents a separate
101 gene family clustering (for example, based on different
102 species or different clustering parameters).
103 Property: the build name [string]
104 Side Effects:
105 Example:
107 =cut
109 has 'build' => (
110 is => 'rw',
111 required => 1,
114 =head2 get_alignment
116 Usage: my $alignment = $gf->get_alignment()
117 Desc: returns the alignment as a string
118 Args: none
119 Side Effects: dies if the alignment has not yet been calculated.
120 Example:
122 =cut
124 sub get_alignment {
125 my $self = shift;
126 my $file =
127 catfile( $self->get_path(), "alignments", $self->name() . ".fa.align" );
129 if ( !-e $file ) {
130 die "No alignment file available for family " . $self->name();
133 return slurp($file);
136 =head2 get_fasta
138 Usage: my $fasta = $gf->get_fasta()
139 Desc: returns the sequences of a gene family as a string
140 formatted in fasta.
141 Ret: fasta
142 Args: none
143 Side Effects: dies if the fasta is not available.
144 Example:
146 =cut
148 sub get_fasta {
149 my $self = shift;
150 my $file = catfile( $self->get_path(), "fasta", $self->name() . ".fa" );
152 print STDERR "Retrieving fasta file $file for family ".$self->name()."\n";
153 unless( -f $file ) {
154 die "The fasta information for family "
155 . $self->name()
156 . " cannot be found";
158 return slurp($file);
161 =head2 get_seqs
163 Usage: my $fasta = $gf->get_seqs()
164 Desc: returns the sequences of a gene family as a list of
165 Bio::Seq objects
166 Ret:
167 Args: none
168 Side Effects: dies if the fasta information is not available.
169 Example:
171 =cut
173 sub get_seqs {
174 my $self = shift;
175 my $file = catfile( $self->get_path(), "fasta", $self->name() . ".fa" );
176 if ( !-e $file ) {
177 die "The fasta information for family "
178 . $self->name()
179 . " cannot be found";
181 my @seqs = ();
182 my $io = Bio::SeqIO->new( -format => 'fasta', -file => $file );
183 while ( my $seq = $io->next_seq() ) {
184 push @seqs, $seq;
186 return @seqs;
189 =head2 get_tree
191 Usage:
192 Desc:
193 Ret:
194 Args:
195 Side Effects:
196 Example:
198 =cut
200 sub get_tree {
201 my $self = shift;
202 my $file =
203 catfile( $self->get_path(), "/trees/" . $self->name() . ".tree" );
204 if ( !-e $file ) {
205 die "The tree information for family "
206 . $self->name()
207 . " cannot be found";
209 return slurp($file);
212 =head1 get_sequence
214 =cut
216 sub get_sequence {
217 my $self = shift;
218 my $sequence = shift;
220 my $file = File::Spec->catfile($self->get_path(), 'fasta', $self->name().".fa");
222 my @seqs = ();
223 my $io = Bio::SeqIO->new( -format => 'fasta', -file => $file );
225 while (my $seq = $io->next_seq()) {
226 print STDERR "Now checking id ".$seq->id()." against search term $sequence\n";
227 if ($seq->id() eq $sequence) {
228 return [ $seq->id(), $seq->desc(), $seq->seq() ]
231 return [];
235 =head1 get_members
237 =cut
239 sub get_members {
240 my $self = shift;
241 my $family = shift;
243 my $defs = File::Spec->catfile($self->get_path(), $self->genefamily_defs_file());
245 print STDERR "Getting member info for family $family from file $defs\n";
247 open(my $F, "<", $defs) || die "Can't open gene families definition file at $defs";
249 my @all_members;
250 while(<$F>) {
251 chomp;
253 my ($family_name, @members) = split/\t/;
255 if ($family_name eq $family) {
256 foreach my $m (@members) {
258 my @species_members = split/\,/, $m;
259 foreach my $id (@species_members) {
260 $id = '<a href="'.$self->sequence_link()."/".$self->build()."/$family/$id".'">'.$id."</a>";
262 @all_members = (@all_members, @species_members);
266 return \@all_members;
269 =head2 get_available_builds
271 Usage: my @ds = SGN::Genefamily->get_available_builds($DIR)
272 Desc: a class function that returns the available builds
273 Ret: a list of build names
274 Args: the $DIR where the builds are located.
275 Side Effects:
276 Example:
278 =cut
280 sub get_available_builds {
281 my $class = shift;
282 my $path = shift;
283 my @dirs = map { basename($_) } grep -d, glob $path."/*";
284 return @dirs;
287 sub get_path {
288 my $self = shift;
289 return catfile( $self->files_dir(), $self->build() );
292 sub table {
293 my $self = shift;
295 my $plugin = $self->genefamily_format();
296 $self->load_plugin($plugin);
297 my $table = $self->get_data($self->build());
299 return $table;