4 SGN::Genefamily - a class to deal with (currently disk-based) genefamilies for tomato annotation purposes
8 The gene families are defined by alignment files in a subdirectory, which makes it easy to update the family definitions; this will happen frequently over the next two months. After that, the gene families will be moved to the database, so this code is only temporary.
12 Lukas Mueller <lam87@cornell.edu>
16 Methods in this class include:
20 package SGN
::Genefamily
;
24 with
'MooseX::Object::Pluggable';
26 use namespace
::autoclean
;
28 use File
::Slurp qw
/slurp/;
29 use File
::Spec qw
| catfile
|;
30 use File
::Spec
::Functions
;
31 use File
::Basename qw
/basename/;
33 =head2 accessors genefamily_format()
37 has
'genefamily_format' => (
42 has
'genefamily_defs_file' => (
45 default => sub { return 'genefamily_defs.txt' },
48 has
'sequence_link' => (
51 default => sub { return '/tools/genefamily/seq/'; } # add /$build/$family/$seq_id
55 =head2 accessors name()
58 Property: the name of the gene family
59 Side Effects: the name is used to build the names of the corresponding family files (for example <name>.fa and <name>.fa.align)
71 # Usage: my @members = $gf->members()
72 # Desc: retrieves the members of a genefamily. Read only.
73 # Property: the members of the gene family
79 # has 'members' => ( is => 'ro', isa => 'ArrayRef', default => sub { [] } );
83 Usage: my $dir = $gf->files_dir()
84 Desc: accessor for the directory where the gene family files are located.
86 Side Effects: used for retrieving gene family information
98 Usage: my $d = $gf->build()
99 Desc: the name of the build to use. Under the genefamily dir
100 (files_dir), a number of sub-dirs should be present, each of
101 which represents a separate gene family clustering (for example,
102 based on different species or different clustering parameters).
103 Property: the build name [string]
116 Usage: my $alignment = $gf->get_alignment()
117 Desc: returns the alignment as a string
119 Side Effects: dies if the alignment has not yet been calculated.
127 catfile
( $self->get_path(), "alignments", $self->name() . ".fa.align" );
130 die "No alignment file available for family " . $self->name();
138 Usage: my $fasta = $gf->get_fasta()
139 Desc: returns the sequences of a gene family as a string
143 Side Effects: dies if the fasta is not available.
150 my $file = catfile
( $self->get_path(), "fasta", $self->name() . ".fa" );
152 print STDERR
"Retrieving fasta file $file for family ".$self->name()."\n";
154 die "The fasta information for family "
156 . " cannot be found";
163 Usage: my $fasta = $gf->get_seqs()
164 Desc: returns the sequences of a gene family as a list of
168 Side Effects: dies if the fasta information is not available.
175 my $file = catfile
( $self->get_path(), "fasta", $self->name() . ".fa" );
177 die "The fasta information for family "
179 . " cannot be found";
182 my $io = Bio
::SeqIO
->new( -format
=> 'fasta', -file
=> $file );
183 while ( my $seq = $io->next_seq() ) {
203 catfile
( $self->get_path(), "/trees/" . $self->name() . ".tree" );
205 die "The tree information for family "
207 . " cannot be found";
218 my $sequence = shift;
220 my $file = File
::Spec
->catfile($self->get_path(), 'fasta', $self->name().".fa");
223 my $io = Bio
::SeqIO
->new( -format
=> 'fasta', -file
=> $file );
225 while (my $seq = $io->next_seq()) {
226 print STDERR
"Now checking id ".$seq->id()." against search term $sequence\n";
227 if ($seq->id() eq $sequence) {
228 return [ $seq->id(), $seq->desc(), $seq->seq() ]
243 my $defs = File
::Spec
->catfile($self->get_path(), $self->genefamily_defs_file());
245 print STDERR
"Getting member info for family $family from file $defs\n";
247 open(my $F, "<", $defs) || die "Can't open gene families definition file at $defs";
253 my ($family_name, @members) = split/\t/;
255 if ($family_name eq $family) {
256 foreach my $m (@members) {
258 my @species_members = split/\,/, $m;
259 foreach my $id (@species_members) {
260 $id = '<a href="'.$self->sequence_link()."/".$self->build()."/$family/$id".'">'.$id."</a>";
262 @all_members = (@all_members, @species_members);
266 return \
@all_members;
269 =head2 get_available_builds
271 Usage: my @ds = SGN::Genefamily->get_available_builds($DIR)
272 Desc: a class function that returns the available builds
273 Ret: a list of build names
274 Args: the $DIR where the builds are located.
280 sub get_available_builds
{
283 my @dirs = map { basename
($_) } grep -d
, glob $path."/*";
289 return catfile
( $self->files_dir(), $self->build() );
295 my $plugin = $self->genefamily_format();
296 $self->load_plugin($plugin);
297 my $table = $self->get_data($self->build());