ide/bioperl-mode: into its own repository to be developed separately
[bioperl-live.git] / Bio / SearchIO / Writer / GbrowseGFF.pm
blob6e34d6a47096f952506ab0d2c53d2c37fe30c139
1 #-----------------------------------------------------------------
3 # BioPerl module Bio::SearchIO::Writer::GbrowseGFF.pm
5 # Please direct questions and support issues to <bioperl-l@bioperl.org>
7 # Cared for by Mark Wilkinson <markw@illuminae.com>
9 # You may distribute this module under the same terms as perl itself
10 #-----------------------------------------------------------------
12 =head1 NAME
14 Bio::SearchIO::Writer::GbrowseGFF - Interface for outputting parsed search results in Gbrowse GFF format
16 =head1 SYNOPSIS
18 use Bio::SearchIO;
19 my $in = Bio::SearchIO->new(-file => 'result.blast',
20 -format => 'blast');
21 my $out = Bio::SearchIO->new(-output_format => 'GbrowseGFF',
22 -output_cigar => 1,
23 -output_signif => 1,
24 -file => ">result.gff");
25 while( my $r = $in->next_result ) {
26 $out->write_result($r);
29 =head1 DESCRIPTION
31 This writer produces Gbrowse flavour GFF from a Search::Result object.
33 =head1 AUTHOR Mark Wilkinson
35 Email markw-at-illuminae-dot-com
37 =head1 CONTRIBUTORS
39 Susan Miller sjmiller at email-DOT-arizon-DOT-edu
40 Jason Stajich jason at bioperl-dot-org
42 =head1 FEEDBACK
44 =head2 Mailing Lists
46 User feedback is an integral part of the evolution of this and other
47 Bioperl modules. Send your comments and suggestions preferably to
48 the Bioperl mailing list. Your participation is much appreciated.
50 bioperl-l@bioperl.org - General discussion
51 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
53 =head2 Support
55 Please direct usage questions or support issues to the mailing list:
57 I<bioperl-l@bioperl.org>
59 rather than to the module maintainer directly. Many experienced and
60 reponsive experts will be able look at the problem and quickly
61 address it. Please include a thorough description of the problem
62 with code and data examples if at all possible.
64 =head2 Reporting Bugs
66 Report bugs to the Bioperl bug tracking system to help us keep track
67 of the bugs and their resolution. Bug reports can be submitted via
68 the web:
70 https://github.com/bioperl/bioperl-live/issues
72 =head1 APPENDIX
74 The rest of the documentation details each of the object methods.
75 Internal methods are usually preceded with a _
77 =cut
80 package Bio::SearchIO::Writer::GbrowseGFF;
81 use vars qw(%Defaults);
82 use strict;
84 $Defaults{'Prefix'} = 'EST';
85 $Defaults{'HSPTag'} = 'HSP';
86 $Defaults{'MatchTag'} = 'match';
88 use base qw(Bio::Root::Root Bio::SearchIO::SearchWriterI);
91 =head2 new
93 Title : new
94 Usage : my $obj = Bio::SearchIO::Writer::GbrowseGFF->new(@args);
95 Function: Builds a new Bio::SearchIO::Writer::GbrowseGFF object
96 Returns : an instance of Bio::SearchIO::Writer::GbrowseGFF
97 Args : -e_value => 10 : set e_value parsing cutoff (default undef)
98 (note the -e_value flag is deprecated.)
100 =cut
102 sub new {
103 my($class,@args) = @_;
105 my $self = $class->SUPER::new(@args);
106 ($self->{'_evalue'},
107 $self->{'_cigar'},
108 $self->{'_prefix'},
109 $self->{'_signif'} ) = $self->_rearrange([qw(E_VALUE OUTPUT_CIGAR PREFIX
110 OUTPUT_SIGNIF)], @args);
111 $self->{'_evalue'} && warn( "Use of the -e_value argument is deprecated.\nIn future, use \$writer->filter(\"type\", \&code) instead.\n\tparsing will proceed correctly with this e_value\n");
112 $self->{Gbrowse_HSPID} = 0;
113 $self->{Gbrowse_HITID} = 0;
114 $self->{'_prefix'} ||= $Defaults{'Prefix'};
115 return $self;
118 sub _incrementHSP {
119 my ($self) = @_;
120 return ++$self->{Gbrowse_HSPID};
123 sub _incrementHIT {
124 my ($self) = @_;
125 return ++$self->{Gbrowse_HITID}
127 # according to the GFF3 spec:
128 #"match". In addition to the generic "match"
129 #type, there are the subclasses "cDNA_match," "EST_match,"
130 #"translated_nucleotide_match," "nucleotide_to_protein_match," and
131 #"nucleotide_motif."
133 =head2 to_string
135 Purpose : Produce the Gbrowse format GFF lines for a Result
136 Usage : print $writer->to_string( $result_obj, @args);
137 Argument : $result_obj = A Bio::Search::Result::ResultI object
138 -version => 1|2|2.5|3 ; the GFF format you want to output (default 3)
139 -match_tag => match|cDNA_match|EST_match|translated_nucleotide_match
140 nucleotide_to_protein_match|nucleotide_motif
141 This is the SO term to be placed in GFF column 3.
142 -prefix => String to prefix the group by, default is EST
143 (see %Defaults class variable) A default can also
144 be set on object init
145 Returns : String containing data for each search Result or any of its
146 : sub-objects (Hits and HSPs).
147 Throws : n/a
149 =cut
151 #-reference => 'hit'|'query' ; whether the hit sequence name or the
152 # query sequence name is used as the
153 # 'reference' sequence (GFF column 1)
155 sub to_string {
156 my ($self, $result, @args) = @_;
157 my ($format, $reference,
158 $match_tag,$hsp_tag,
159 $prefix) = $self->_rearrange([qw
160 (VERSION
161 REFERENCE
162 MATCH_TAG HSP_TAG
163 PREFIX)], @args);
164 $self->warn($reference) if $reference;
165 $reference ||='hit'; # default is that the hit sequence (db sequence) becomes the reference sequence. I think this is fairly typical...
166 $match_tag ||= $Defaults{'MatchTag'}; # default is the generic 'match' tag.
167 $hsp_tag ||= $Defaults{'HSPTag'}; # default is the generic 'hsp' tag.
168 $prefix ||= $self->{'_prefix'};
169 $self->throw("$reference must be one of 'query', or 'hit'\n") unless $reference;
171 #************* THIS IS WHERE I STOPPED ****************
172 # *****************************************************
173 #*************************************************
175 $format ||='3';
176 my $gffio = Bio::Tools::GFF->new(-gff_version => $format); # try to set it
178 # just in case that behaviour changes (at the moment, an invalid format throws an exception, but it might return undef in the future
179 return "" unless defined $gffio; # be kind and don't return undef in case the person is putting the output directly into a print statement without testing it
180 # now $gffio is either false, or a valid GFF formatter
182 my ($GFF,$cigar,$score);
183 my ($resultfilter,$hitfilter,$hspfilter) = (
184 $self->filter('RESULT'),
185 $self->filter('HIT'),
186 $self->filter('HSP'));
187 $result->can('rewind') && $result->rewind(); # ensure we're at the beginning
188 next if (defined $resultfilter && ! (&{$resultfilter}($result)) );
190 while( my $hit = $result->next_hit ) {
192 if (defined $self->{_evalue}){
193 next unless ($hit->significance < $self->{_evalue});
195 next if( defined $hitfilter && ! &{$hitfilter}($hit) ); # test against filter code
197 my $refseq = $reference eq 'hit' ? $hit->name : $result->query_name;
198 my $seqname = $reference eq 'hit' ? $result->query_name : $hit->name; # hopefully this will be a simple identifier without a full description line!!
199 if ($self->{_signif}) {
200 $score = $hit->significance;
201 } else {
202 $score = $hit->raw_score;
204 $self->throw("No reference sequence name found in hit; required for GFF (this may not be your fault if your report type does not include reference sequence names)\n") unless $refseq;
205 my $source = $hit->algorithm;
206 $self->throw("No algorithm name found in hit; required for GFF (this may not be your fault if your report type does not include algorithm names)\n") unless $refseq;
207 $self->throw("This module only works on BLASTN reports at this time. Sorry.\n") unless $source eq "BLASTN";
209 my @plus_hsps;
210 my @minus_hsps;
212 # pre-process the HSP's because we later need to know
213 # the extents of the plus and munus strand
214 # on both the subject and query strands individually
215 my ($qpmin, $qpmax, $qmmin, $qmmax, $spmin, $spmax, $smmin, $smmax); # variables for the plus/minus strand min start and max end to know the full extents of the hit
216 while( my $hsp = $hit->next_hsp ) {
217 if ( defined $self->{_evalue} ) {
218 # for backward compatibility only
219 next unless ($hsp->significance < $self->{_evalue});
221 next if( defined $hspfilter && ! &{$hspfilter}($hsp) ); # test against HSP filter
222 if ($hsp->hit->strand >= 0 ){
223 push @plus_hsps, $hsp;
224 if (defined $qpmin){ # set or reset the minimum and maximum extent of the plus-strand hit
225 $qpmin = $hsp->query->start if $hsp->query->start < $qpmin;
226 $qpmax = $hsp->query->end if $hsp->query->end > $qpmax;
227 $spmin = $hsp->hit->start if $hsp->hit->start < $spmin;
228 $spmax = $hsp->hit->end if $hsp->hit->end > $spmax;
229 } else {
230 $qpmin = $hsp->query->start;
231 $qpmax = $hsp->query->end;
232 $spmin = $hsp->hit->start;
233 $spmax = $hsp->hit->end;
236 if ($hsp->hit->strand < 0 ){
237 push @minus_hsps, $hsp;
238 if (defined $qmmin){ # set or reset the minimum and maximum extent of the minus-strand hit
239 $qmmin = $hsp->query->start if $hsp->query->start < $qmmin;
240 $qmmax = $hsp->query->end if $hsp->query->end > $qmmax;
241 $smmin = $hsp->hit->start if $hsp->hit->start < $smmin;
242 $smmax = $hsp->hit->end if $hsp->hit->end > $smmax;
243 } else {
244 $qmmin = $hsp->query->start;
245 $qmmax = $hsp->query->end;
246 $smmin = $hsp->hit->start;
247 $smmax = $hsp->hit->end;
250 #else next if there is no strand, but that makes no sense..??
252 next unless (scalar(@plus_hsps) + scalar(@minus_hsps)); # next if no hsps (??)
253 my $ID = $self->_incrementHIT();
254 # okay, write out the index line for the entire hit before
255 # processing HSP's
256 # unfortunately (or not??), HitI objects do not implement
257 # SeqFeatureI, so we can't just call ->gff_string
258 # as a result, this module is quite brittle to changes
259 # in the GFF format since we are hard-coding the GFF output here :-(
261 if (scalar(@plus_hsps)){
262 my %tags = ( 'ID' => "match_sequence$ID");
264 if ($format==2.5) {
265 $tags{'Target'} = "$prefix:$seqname";
266 $tags{'tstart'} = $qmmin;
267 $tags{'tend'} = $qmmax;
268 } else {
269 $tags{'Target'} = "$prefix:$seqname $qpmin $qpmax";
271 if ( $self->{'_cigar'} ) {
272 $tags{'Gap'} = $cigar;
274 my $feat = Bio::SeqFeature::Generic->new(
275 -seq_id => $refseq,
276 -source_tag => $source,
277 -primary_tag => $match_tag,
278 -start => $spmin,
279 -end => $spmax,
280 -score => $score,
281 -strand => '+',
282 -frame => '.',
283 -tag => \%tags
287 my $formatter = Bio::Tools::GFF->new(-gff_version => $format);
288 $GFF .= $feat->gff_string($formatter)."\n";
290 if (scalar(@minus_hsps)){
291 my %tags = ( 'ID' => "match_sequence$ID");
293 if ($format==2.5) {
294 $tags{'Target'} = "$prefix:$seqname";
295 $tags{'tstart'} = $qpmax;
296 $tags{'tend'} = $qpmin;
298 else {
299 $tags{'Target'} = "$prefix:$seqname $qpmax $qpmin";
301 my $feat = Bio::SeqFeature::Generic->new(
302 -seq_id => $refseq,
303 -source_tag => $source,
304 -primary_tag => $match_tag,
305 -start => $smmin,
306 -end => $smmax,
307 -score => $score,
308 -strand => '-',
309 -frame => '.',
310 -tag => \%tags
313 my $formatter = Bio::Tools::GFF->new(-gff_version => $format);
314 $GFF .= $feat->gff_string($formatter)."\n";
317 # process + strand hsps
318 foreach my $hsp (@plus_hsps){
319 my $hspID = $self->_incrementHSP();
320 my $qstart = $hsp->query->start;
321 my $qend = $hsp->query->end;
322 my $sstart = $hsp->hit->start;
323 my $send = $hsp->hit->end;
324 my $score = $hsp->score;
326 my %tags = ( 'ID' => "match_hsp$hspID",
327 'Parent' => "match_sequence$ID" );
329 if ($format==2.5) {
330 $tags{'Target'} = "$prefix:$seqname";
331 $tags{'tstart'} = $qstart;
332 $tags{'tend'} = $qend;
334 else {
335 $tags{'Target'} = "$prefix:$seqname $qstart $qend";
337 if ( $self->{'_cigar'} ) {
338 $tags{'Gap'} = $hsp->cigar_string;
341 my $feat = Bio::SeqFeature::Generic->new(
342 -seq_id => $refseq,
343 -source_tag => $source,
344 -primary_tag => $hsp_tag,
345 -start => $sstart,
346 -end => $send,
347 -score => $score,
348 -strand => '+',
349 -frame => '.',
350 -tag => \%tags
353 my $formatter = Bio::Tools::GFF->new(-gff_version => $format);
354 $GFF .= $feat->gff_string($formatter)."\n";
357 foreach my $hsp (@minus_hsps) {
358 my $hspID = $self->_incrementHSP();
359 my $qstart = $hsp->query->start;
360 my $qend = $hsp->query->end;
361 my $sstart = $hsp->hit->start;
362 my $send = $hsp->hit->end;
363 my $score = $hsp->score;
365 my %tags = ( 'ID' => "match_hsp$hspID",
366 'Parent' => "match_sequence$ID" );
368 if ($format==2.5) {
369 $tags{'Target'} = "$prefix:$seqname";
370 $tags{'tstart'} = $qend;
371 $tags{'tend'} = $qstart;
373 else {
374 $tags{'Target'} = "$prefix:$seqname $qend $qstart";
376 if ( $self->{'_cigar'} ) {
377 $tags{'Gap'} = $hsp->cigar_string;
380 my $feat = Bio::SeqFeature::Generic->new(
381 -seq_id => $refseq,
382 -source_tag => $source,
383 -primary_tag => $hsp_tag,
384 -start => $sstart,
385 -end => $send,
386 -score => $score,
387 -strand => '-',
388 -frame => '.',
389 -tag => \%tags
392 my $formatter = Bio::Tools::GFF->new(-gff_version => $format);
393 $GFF .= $feat->gff_string($formatter) ."\n";
396 return $GFF;
399 sub significance_filter {
400 my ($self,$method,$code) = @_;
401 return unless $method;
402 $method = uc($method);
403 if( $method ne 'HSP' &&
404 $method ne 'HIT' &&
405 $method ne 'RESULT' ) {
406 $self->warn("Unknown method $method");
407 return;
409 if( $code ) {
410 $self->throw("Must provide a valid code reference") unless ref($code) =~ /CODE/;
411 $self->{$method} = $code;
413 return $self->{$method};
416 =head2 start_report
418 Title : start_report
419 Usage : $self->start_report()
420 Function: has no function, returns nothing
421 Returns : empty string
422 Args : none
424 =cut
426 sub start_report { return '' }
428 =head2 end_report
430 Title : end_report
431 Usage : $self->end_report()
432 Function: has no function, returns nothing
433 Returns : empty string
434 Args : none
437 =cut
439 sub end_report { return '' }
441 =head2 filter
443 Title : filter
444 Usage : $writer->filter('hsp', \&hsp_filter);
445 Function: Filter out either at HSP,Hit,or Result level
446 Returns : none
447 Args : string => data type,
448 CODE reference
449 Note : GbrowseGFF.pm makes no changes to the default filter code
452 =cut