1 #-----------------------------------------------------------------
3 # BioPerl module Bio::SearchIO::Writer::GbrowseGFF.pm
5 # Please direct questions and support issues to <bioperl-l@bioperl.org>
7 # Cared for by Mark Wilkinson <markw@illuminae.com>
9 # You may distribute this module under the same terms as perl itself
10 #-----------------------------------------------------------------
14 Bio::SearchIO::Writer::GbrowseGFF - Interface for outputting parsed search results in Gbrowse GFF format
19 my $in = Bio::SearchIO->new(-file => 'result.blast',
21 my $out = Bio::SearchIO->new(-output_format => 'GbrowseGFF',
24 -file => ">result.gff");
25 while( my $r = $in->next_result ) {
26 $out->write_result($r);
31 This writer produces Gbrowse flavour GFF from a Search::Result object.
33 =head1 AUTHOR Mark Wilkinson
35 Email markw-at-illuminae-dot-com
39 Susan Miller sjmiller at email-DOT-arizon-DOT-edu
40 Jason Stajich jason at bioperl-dot-org
46 User feedback is an integral part of the evolution of this and other
47 Bioperl modules. Send your comments and suggestions preferably to
48 the Bioperl mailing list. Your participation is much appreciated.
50 bioperl-l@bioperl.org - General discussion
51 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
55 Please direct usage questions or support issues to the mailing list:
57 I<bioperl-l@bioperl.org>
59 rather than to the module maintainer directly. Many experienced and
60 reponsive experts will be able look at the problem and quickly
61 address it. Please include a thorough description of the problem
62 with code and data examples if at all possible.
66 Report bugs to the Bioperl bug tracking system to help us keep track
67 of the bugs and their resolution. Bug reports can be submitted via
70 https://github.com/bioperl/bioperl-live/issues
74 The rest of the documentation details each of the object methods.
75 Internal methods are usually preceded with a _
80 package Bio
::SearchIO
::Writer
::GbrowseGFF
;
81 use vars
qw(%Defaults);
84 $Defaults{'Prefix'} = 'EST';
85 $Defaults{'HSPTag'} = 'HSP';
86 $Defaults{'MatchTag'} = 'match';
88 use base qw(Bio::Root::Root Bio::SearchIO::SearchWriterI);
94 Usage : my $obj = Bio::SearchIO::Writer::GbrowseGFF->new(@args);
95 Function: Builds a new Bio::SearchIO::Writer::GbrowseGFF object
96 Returns : an instance of Bio::SearchIO::Writer::GbrowseGFF
97 Args : -e_value => 10 : set e_value parsing cutoff (default undef)
98 (note the -e_value flag is deprecated.)
103 my($class,@args) = @_;
105 my $self = $class->SUPER::new
(@args);
109 $self->{'_signif'} ) = $self->_rearrange([qw(E_VALUE OUTPUT_CIGAR PREFIX
110 OUTPUT_SIGNIF)], @args);
111 $self->{'_evalue'} && warn( "Use of the -e_value argument is deprecated.\nIn future, use \$writer->filter(\"type\", \&code) instead.\n\tparsing will proceed correctly with this e_value\n");
112 $self->{Gbrowse_HSPID
} = 0;
113 $self->{Gbrowse_HITID
} = 0;
114 $self->{'_prefix'} ||= $Defaults{'Prefix'};
120 return ++$self->{Gbrowse_HSPID
};
125 return ++$self->{Gbrowse_HITID
}
127 # according to the GFF3 spec:
128 #"match". In addition to the generic "match"
129 #type, there are the subclasses "cDNA_match," "EST_match,"
130 #"translated_nucleotide_match," "nucleotide_to_protein_match," and
135 Purpose : Produce the Gbrowse format GFF lines for a Result
136 Usage : print $writer->to_string( $result_obj, @args);
137 Argument : $result_obj = A Bio::Search::Result::ResultI object
138 -version => 1|2|2.5|3 ; the GFF format you want to output (default 3)
139 -match_tag => match|cDNA_match|EST_match|translated_nucleotide_match
140 nucleotide_to_protein_match|nucleotide_motif
141 This is the SO term to be placed in GFF column 3.
142 -prefix => String to prefix the group by, default is EST
143 (see %Defaults class variable) A default can also
144 be set on object init
145 Returns : String containing data for each search Result or any of its
146 : sub-objects (Hits and HSPs).
151 #-reference => 'hit'|'query' ; whether the hit sequence name or the
152 # query sequence name is used as the
153 # 'reference' sequence (GFF column 1)
156 my ($self, $result, @args) = @_;
157 my ($format, $reference,
159 $prefix) = $self->_rearrange([qw
164 $self->warn($reference) if $reference;
165 $reference ||='hit'; # default is that the hit sequence (db sequence) becomes the reference sequence. I think this is fairly typical...
166 $match_tag ||= $Defaults{'MatchTag'}; # default is the generic 'match' tag.
167 $hsp_tag ||= $Defaults{'HSPTag'}; # default is the generic 'hsp' tag.
168 $prefix ||= $self->{'_prefix'};
169 $self->throw("$reference must be one of 'query', or 'hit'\n") unless $reference;
171 #************* THIS IS WHERE I STOPPED ****************
172 # *****************************************************
173 #*************************************************
176 my $gffio = Bio
::Tools
::GFF
->new(-gff_version
=> $format); # try to set it
178 # just in case that behaviour changes (at the moment, an invalid format throws an exception, but it might return undef in the future
179 return "" unless defined $gffio; # be kind and don't return undef in case the person is putting the output directly into a print statement without testing it
180 # now $gffio is either false, or a valid GFF formatter
182 my ($GFF,$cigar,$score);
183 my ($resultfilter,$hitfilter,$hspfilter) = (
184 $self->filter('RESULT'),
185 $self->filter('HIT'),
186 $self->filter('HSP'));
187 $result->can('rewind') && $result->rewind(); # ensure we're at the beginning
188 next if (defined $resultfilter && ! (&{$resultfilter}($result)) );
190 while( my $hit = $result->next_hit ) {
192 if (defined $self->{_evalue
}){
193 next unless ($hit->significance < $self->{_evalue
});
195 next if( defined $hitfilter && ! &{$hitfilter}($hit) ); # test against filter code
197 my $refseq = $reference eq 'hit' ?
$hit->name : $result->query_name;
198 my $seqname = $reference eq 'hit' ?
$result->query_name : $hit->name; # hopefully this will be a simple identifier without a full description line!!
199 if ($self->{_signif
}) {
200 $score = $hit->significance;
202 $score = $hit->raw_score;
204 $self->throw("No reference sequence name found in hit; required for GFF (this may not be your fault if your report type does not include reference sequence names)\n") unless $refseq;
205 my $source = $hit->algorithm;
206 $self->throw("No algorithm name found in hit; required for GFF (this may not be your fault if your report type does not include algorithm names)\n") unless $refseq;
207 $self->throw("This module only works on BLASTN reports at this time. Sorry.\n") unless $source eq "BLASTN";
212 # pre-process the HSP's because we later need to know
213 # the extents of the plus and munus strand
214 # on both the subject and query strands individually
215 my ($qpmin, $qpmax, $qmmin, $qmmax, $spmin, $spmax, $smmin, $smmax); # variables for the plus/minus strand min start and max end to know the full extents of the hit
216 while( my $hsp = $hit->next_hsp ) {
217 if ( defined $self->{_evalue
} ) {
218 # for backward compatibility only
219 next unless ($hsp->significance < $self->{_evalue
});
221 next if( defined $hspfilter && ! &{$hspfilter}($hsp) ); # test against HSP filter
222 if ($hsp->hit->strand >= 0 ){
223 push @plus_hsps, $hsp;
224 if (defined $qpmin){ # set or reset the minimum and maximum extent of the plus-strand hit
225 $qpmin = $hsp->query->start if $hsp->query->start < $qpmin;
226 $qpmax = $hsp->query->end if $hsp->query->end > $qpmax;
227 $spmin = $hsp->hit->start if $hsp->hit->start < $spmin;
228 $spmax = $hsp->hit->end if $hsp->hit->end > $spmax;
230 $qpmin = $hsp->query->start;
231 $qpmax = $hsp->query->end;
232 $spmin = $hsp->hit->start;
233 $spmax = $hsp->hit->end;
236 if ($hsp->hit->strand < 0 ){
237 push @minus_hsps, $hsp;
238 if (defined $qmmin){ # set or reset the minimum and maximum extent of the minus-strand hit
239 $qmmin = $hsp->query->start if $hsp->query->start < $qmmin;
240 $qmmax = $hsp->query->end if $hsp->query->end > $qmmax;
241 $smmin = $hsp->hit->start if $hsp->hit->start < $smmin;
242 $smmax = $hsp->hit->end if $hsp->hit->end > $smmax;
244 $qmmin = $hsp->query->start;
245 $qmmax = $hsp->query->end;
246 $smmin = $hsp->hit->start;
247 $smmax = $hsp->hit->end;
250 #else next if there is no strand, but that makes no sense..??
252 next unless (scalar(@plus_hsps) + scalar(@minus_hsps)); # next if no hsps (??)
253 my $ID = $self->_incrementHIT();
254 # okay, write out the index line for the entire hit before
256 # unfortunately (or not??), HitI objects do not implement
257 # SeqFeatureI, so we can't just call ->gff_string
258 # as a result, this module is quite brittle to changes
259 # in the GFF format since we are hard-coding the GFF output here :-(
261 if (scalar(@plus_hsps)){
262 my %tags = ( 'ID' => "match_sequence$ID");
265 $tags{'Target'} = "$prefix:$seqname";
266 $tags{'tstart'} = $qmmin;
267 $tags{'tend'} = $qmmax;
269 $tags{'Target'} = "$prefix:$seqname $qpmin $qpmax";
271 if ( $self->{'_cigar'} ) {
272 $tags{'Gap'} = $cigar;
274 my $feat = Bio
::SeqFeature
::Generic
->new(
276 -source_tag
=> $source,
277 -primary_tag
=> $match_tag,
287 my $formatter = Bio
::Tools
::GFF
->new(-gff_version
=> $format);
288 $GFF .= $feat->gff_string($formatter)."\n";
290 if (scalar(@minus_hsps)){
291 my %tags = ( 'ID' => "match_sequence$ID");
294 $tags{'Target'} = "$prefix:$seqname";
295 $tags{'tstart'} = $qpmax;
296 $tags{'tend'} = $qpmin;
299 $tags{'Target'} = "$prefix:$seqname $qpmax $qpmin";
301 my $feat = Bio
::SeqFeature
::Generic
->new(
303 -source_tag
=> $source,
304 -primary_tag
=> $match_tag,
313 my $formatter = Bio
::Tools
::GFF
->new(-gff_version
=> $format);
314 $GFF .= $feat->gff_string($formatter)."\n";
317 # process + strand hsps
318 foreach my $hsp (@plus_hsps){
319 my $hspID = $self->_incrementHSP();
320 my $qstart = $hsp->query->start;
321 my $qend = $hsp->query->end;
322 my $sstart = $hsp->hit->start;
323 my $send = $hsp->hit->end;
324 my $score = $hsp->score;
326 my %tags = ( 'ID' => "match_hsp$hspID",
327 'Parent' => "match_sequence$ID" );
330 $tags{'Target'} = "$prefix:$seqname";
331 $tags{'tstart'} = $qstart;
332 $tags{'tend'} = $qend;
335 $tags{'Target'} = "$prefix:$seqname $qstart $qend";
337 if ( $self->{'_cigar'} ) {
338 $tags{'Gap'} = $hsp->cigar_string;
341 my $feat = Bio
::SeqFeature
::Generic
->new(
343 -source_tag
=> $source,
344 -primary_tag
=> $hsp_tag,
353 my $formatter = Bio
::Tools
::GFF
->new(-gff_version
=> $format);
354 $GFF .= $feat->gff_string($formatter)."\n";
357 foreach my $hsp (@minus_hsps) {
358 my $hspID = $self->_incrementHSP();
359 my $qstart = $hsp->query->start;
360 my $qend = $hsp->query->end;
361 my $sstart = $hsp->hit->start;
362 my $send = $hsp->hit->end;
363 my $score = $hsp->score;
365 my %tags = ( 'ID' => "match_hsp$hspID",
366 'Parent' => "match_sequence$ID" );
369 $tags{'Target'} = "$prefix:$seqname";
370 $tags{'tstart'} = $qend;
371 $tags{'tend'} = $qstart;
374 $tags{'Target'} = "$prefix:$seqname $qend $qstart";
376 if ( $self->{'_cigar'} ) {
377 $tags{'Gap'} = $hsp->cigar_string;
380 my $feat = Bio
::SeqFeature
::Generic
->new(
382 -source_tag
=> $source,
383 -primary_tag
=> $hsp_tag,
392 my $formatter = Bio
::Tools
::GFF
->new(-gff_version
=> $format);
393 $GFF .= $feat->gff_string($formatter) ."\n";
399 sub significance_filter
{
400 my ($self,$method,$code) = @_;
401 return unless $method;
402 $method = uc($method);
403 if( $method ne 'HSP' &&
405 $method ne 'RESULT' ) {
406 $self->warn("Unknown method $method");
410 $self->throw("Must provide a valid code reference") unless ref($code) =~ /CODE/;
411 $self->{$method} = $code;
413 return $self->{$method};
419 Usage : $self->start_report()
420 Function: has no function, returns nothing
421 Returns : empty string
426 sub start_report
{ return '' }
431 Usage : $self->end_report()
432 Function: has no function, returns nothing
433 Returns : empty string
439 sub end_report
{ return '' }
444 Usage : $writer->filter('hsp', \&hsp_filter);
445 Function: Filter out either at HSP,Hit,or Result level
447 Args : string => data type,
449 Note : GbrowseGFF.pm makes no changes to the default filter code