Bio::DB::Universal: move into its own distribution
[bioperl-live.git] / Bio / SearchIO / axt.pm
bloba48b30dd4a47d4c9a5047b40d10f708d6d10a39a
2 # BioPerl module for Bio::SearchIO::axt
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Jason Stajich <jason-at-bioperl.org>
8 # Copyright Jason Stajich
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
14 =head1 NAME
16 Bio::SearchIO::axt - a parser for axt format reports
18 =head1 SYNOPSIS
20 use Bio::SearchIO;
21 my $parser = Bio::SearchIO->new(-format => 'axt',
22 -file => 't/data/report.blastz');
23 while( my $result = $parser->next_result ) {
24 while( my $hit = $result->next_hit ) {
25 while( my $hsp = $hit->next_hsp ) {
30 =head1 DESCRIPTION
32 This is a parser and event-generator for AXT format reports. BLASTZ
33 reports (Schwartz et al,(2003) Genome Research, 13:103-107) are normally
34 in LAV format but are commonly post-processed to AXT format; many precomputed
35 BLASTZ reports, such as those found in the UCSC Genome
36 Browser, are in AXT format. This parser will also parse any
37 AXT format produced from any lav report and directly out of BLAT.
39 =head1 FEEDBACK
41 =head2 Mailing Lists
43 User feedback is an integral part of the evolution of this and other
44 Bioperl modules. Send your comments and suggestions preferably to
45 the Bioperl mailing list. Your participation is much appreciated.
47 bioperl-l@bioperl.org - General discussion
48 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
50 =head2 Support
52 Please direct usage questions or support issues to the mailing list:
54 I<bioperl-l@bioperl.org>
56 rather than to the module maintainer directly. Many experienced and
57 reponsive experts will be able look at the problem and quickly
58 address it. Please include a thorough description of the problem
59 with code and data examples if at all possible.
61 =head2 Reporting Bugs
63 Report bugs to the Bioperl bug tracking system to help us keep track
64 of the bugs and their resolution. Bug reports can be submitted via the
65 web:
67 https://github.com/bioperl/bioperl-live/issues
69 =head1 AUTHOR - Jason Stajich
71 Email jason-at-bioperl.org
73 =head1 APPENDIX
75 The rest of the documentation details each of the object methods.
76 Internal methods are usually preceded with a _
78 =cut
81 # Let the code begin...
84 package Bio::SearchIO::axt;
85 use vars qw(%MODEMAP %MAPPING @STATES $GAPCHAR);
86 use strict;
88 use Bio::Search::Result::ResultFactory;
89 use Bio::Search::HSP::HSPFactory;
90 use base qw(Bio::SearchIO);
92 use POSIX;
94 BEGIN {
95 # mapping of NCBI Blast terms to Bioperl hash keys
96 %MODEMAP = ('AXTOutput' => 'result',
97 'Hit' => 'hit',
98 'Hsp' => 'hsp'
100 $GAPCHAR = '-';
101 %MAPPING =
103 'Hsp_score' => 'HSP-score',
104 'Hsp_query-from' => 'HSP-query_start',
105 'Hsp_query-to' => 'HSP-query_end',
106 'Hsp_hit-from' => 'HSP-hit_start',
107 'Hsp_hit-to' => 'HSP-hit_end',
108 'Hsp_positive' => 'HSP-conserved',
109 'Hsp_identity' => 'HSP-identical',
110 'Hsp_gaps' => 'HSP-hsp_gaps',
111 'Hsp_hitgaps' => 'HSP-hit_gaps',
112 'Hsp_querygaps' => 'HSP-query_gaps',
113 'Hsp_qseq' => 'HSP-query_seq',
114 'Hsp_hseq' => 'HSP-hit_seq',
115 'Hsp_midline' => 'HSP-homology_seq', # ignoring this for now
116 'Hsp_align-len' => 'HSP-hsp_length',
118 'Hit_id' => 'HIT-name',
119 'AXTOutput_query-def'=> 'RESULT-query_name',
123 =head2 new
125 Title : new
126 Usage : my $obj = Bio::SearchIO::axt->new();
127 Function: Builds a new Bio::SearchIO::axt object
128 Returns : an instance of Bio::SearchIO::axt
129 Args :
132 =cut
134 sub new {
135 my($class,@args) = @_;
136 my $self = $class->SUPER::new(@args);
137 return $self;
140 =head2 next_result
142 Title : next_result
143 Usage : my $hit = $searchio->next_result;
144 Function: Returns the next Result from a search
145 Returns : Bio::Search::Result::ResultI object
146 Args : none
148 =cut
150 sub next_result{
151 my ($self) = @_;
152 local $/ = "\n";
153 local $_;
155 my ($curquery,$curhit);
156 $self->start_document();
157 my @hit_signifs;
158 while( defined ($_ = $self->_readline )) {
159 next if (/^\s+$/);
160 if( m/^(\d+)\s+ # alignment number - we'll throw this away anyways
161 (\S+)\s+ # Query name
162 (\d+)\s+(\d+)\s+ # Query start Query end (always + strand, 0 based)
163 (\S+)\s+ # Hit name
164 (\d+)\s+(\d+)\s+ # Hit start Hit end (0 based)
165 ([\-\+])\s+ # Hit strand
166 ([\d\.\-]+)\s+ # Score
167 /ox ) {
168 my ($alnnum, $qname,$qstart,$qend, $hname,
169 $hstart,$hend,$hstrand, $score) = ($1,$2,$3,$4,$5,
170 $6,$7,$8,$9);
171 $self->{'_reporttype'} = 'AXT';
172 # Jim's code is 0 based
173 # yes, but axt format is one-based, see bug 3145 - cjfields 10-11-10
174 #$qstart++; $qend++; $hstart++; $hend++;
175 if( defined $curquery && $curquery ne $qname ) {
176 $self->end_element({'Name' => 'Hit'});
177 $self->_pushback($_);
178 $self->end_element({'Name' => 'AXTOutput'});
179 return $self->end_document();
182 if( defined $curhit &&
183 $curhit ne $hname) {
184 # slight duplication here -- keep these in SYNC
185 $self->end_element({'Name' => 'Hit'});
186 $self->start_element({'Name' => 'Hit'});
187 $self->element({'Name' => 'Hit_id',
188 'Data' => $hname});
189 } elsif ( ! defined $curquery ) {
190 $self->start_element({'Name' => 'AXTOutput'});
191 $self->{'_result_count'}++;
192 $self->element({'Name' => 'AXTOutput_query-def',
193 'Data' => $qname });
195 $self->start_element({'Name' => 'Hit'});
196 $self->element({'Name' => 'Hit_id',
197 'Data' => $hname});
199 $self->start_element({'Name' => 'Hsp'});
200 my $queryalign = $self->_readline;
201 my $hitalign = $self->_readline;
202 chomp($queryalign);
203 chomp($hitalign);
204 my $alnlen = length($queryalign);
205 my $qgapnum = ( $queryalign =~ s/\Q$GAPCHAR/$GAPCHAR/g);
206 my $hgapnum = ( $hitalign =~ s/\Q$GAPCHAR/$GAPCHAR/g);
207 my $totalgaps = ($qgapnum + $hgapnum);
209 if( $hstrand eq '-' ) { # strand gets inferred by start/end
210 ($hstart,$hend) = ($hend,$hstart);
212 $self->element({'Name' => 'Hsp_score',
213 'Data' => $score});
214 $self->element({'Name' => 'Hsp_query-from',
215 'Data' => $qstart});
216 $self->element({'Name' => 'Hsp_query-to',
217 'Data' => $qend});
218 $self->element({'Name' => 'Hsp_hit-from',
219 'Data' => $hstart});
220 $self->element({'Name' => 'Hsp_hit-to',
221 'Data' => $hend});
222 $self->element({'Name' => 'Hsp_gaps',
223 'Data' => $qgapnum + $hgapnum});
224 $self->element({'Name' => 'Hsp_querygaps',
225 'Data' => $qgapnum});
226 $self->element({'Name' => 'Hsp_hitgaps',
227 'Data' => $hgapnum});
229 $self->element({'Name' => 'Hsp_identity',
230 'Data' => $alnlen - $totalgaps});
231 $self->element({'Name' => 'Hsp_positive',
232 'Data' => $alnlen - $totalgaps});
233 $self->element({'Name' => 'Hsp_qseq',
234 'Data' => $queryalign});
235 $self->element({'Name' => 'Hsp_hseq',
236 'Data' => $hitalign});
238 $self->end_element({'Name' => 'Hsp'});
239 $curquery = $qname;
240 $curhit = $hname;
243 # fence post
244 if( defined $curquery ) {
245 $self->end_element({'Name' => 'Hit'});
246 $self->end_element({'Name' => 'AXTOutput'});
247 return $self->end_document();
249 return;
252 sub _initialize {
253 my ($self,@args) = @_;
254 $self->SUPER::_initialize(@args);
255 $self->_eventHandler->register_factory('result', Bio::Search::Result::ResultFactory->new(-type => 'Bio::Search::Result::GenericResult'));
257 $self->_eventHandler->register_factory('hsp', Bio::Search::HSP::HSPFactory->new(-type => 'Bio::Search::HSP::GenericHSP'));
261 =head2 start_element
263 Title : start_element
264 Usage : $eventgenerator->start_element
265 Function: Handles a start element event
266 Returns : none
267 Args : hashref with at least 2 keys 'Data' and 'Name'
270 =cut
272 sub start_element{
273 my ($self,$data) = @_;
274 # we currently don't care about attributes
275 my $nm = $data->{'Name'};
276 if( my $type = $MODEMAP{$nm} ) {
277 $self->_mode($type);
278 if( $self->_eventHandler->will_handle($type) ) {
279 my $func = sprintf("start_%s",lc $type);
280 $self->_eventHandler->$func($data->{'Attributes'});
282 unshift @{$self->{'_elements'}}, $type;
284 if($nm eq 'AXTOutput') {
285 $self->{'_values'} = {};
286 $self->{'_result'}= undef;
287 $self->{'_mode'} = '';
292 =head2 end_element
294 Title : start_element
295 Usage : $eventgenerator->end_element
296 Function: Handles an end element event
297 Returns : none
298 Args : hashref with at least 2 keys 'Data' and 'Name'
301 =cut
303 sub end_element {
304 my ($self,$data) = @_;
305 my $nm = $data->{'Name'};
306 my $rc;
307 # Hsp are sort of weird, in that they end when another
308 # object begins so have to detect this in end_element for now
310 if( my $type = $MODEMAP{$nm} ) {
311 if( $self->_eventHandler->will_handle($type) ) {
312 my $func = sprintf("end_%s",lc $type);
313 $rc = $self->_eventHandler->$func($self->{'_reporttype'},
314 $self->{'_values'});
316 shift @{$self->{'_elements'}};
318 } elsif( $MAPPING{$nm} ) {
319 if ( ref($MAPPING{$nm}) =~ /hash/i ) {
320 my $key = (keys %{$MAPPING{$nm}})[0];
321 $self->{'_values'}->{$key}->{$MAPPING{$nm}->{$key}} = $self->{'_last_data'};
322 } else {
323 $self->{'_values'}->{$MAPPING{$nm}} = $self->{'_last_data'};
325 } else {
326 $self->warn( "unknown nm $nm ignoring\n");
328 $self->{'_last_data'} = ''; # remove read data if we are at
329 # end of an element
330 $self->{'_result'} = $rc if( $nm eq 'AXTOutput' );
331 return $rc;
335 =head2 element
337 Title : element
338 Usage : $eventhandler->element({'Name' => $name, 'Data' => $str});
339 Function: Convience method that calls start_element, characters, end_element
340 Returns : none
341 Args : Hash ref with the keys 'Name' and 'Data'
344 =cut
346 sub element{
347 my ($self,$data) = @_;
348 $self->start_element($data);
349 $self->characters($data);
350 $self->end_element($data);
354 =head2 characters
356 Title : characters
357 Usage : $eventgenerator->characters($str)
358 Function: Send a character events
359 Returns : none
360 Args : string
363 =cut
365 sub characters{
366 my ($self,$data) = @_;
368 return unless ( defined $data->{'Data'} );
369 if( $data->{'Data'} =~ /^\s+$/ ) {
370 return unless $data->{'Name'} =~ /Hsp\_(midline|qseq|hseq)/;
373 if( $self->in_element('hsp') &&
374 $data->{'Name'} =~ /Hsp\_(qseq|hseq|midline)/ ) {
376 $self->{'_last_hspdata'}->{$data->{'Name'}} .= $data->{'Data'};
379 $self->{'_last_data'} = $data->{'Data'};
382 =head2 _mode
384 Title : _mode
385 Usage : $obj->_mode($newval)
386 Function:
387 Example :
388 Returns : value of _mode
389 Args : newvalue (optional)
392 =cut
394 sub _mode{
395 my ($self,$value) = @_;
396 if( defined $value) {
397 $self->{'_mode'} = $value;
399 return $self->{'_mode'};
402 =head2 within_element
404 Title : within_element
405 Usage : if( $eventgenerator->within_element($element) ) {}
406 Function: Test if we are within a particular element
407 This is different than 'in' because within can be tested
408 for a whole block.
409 Returns : boolean
410 Args : string element name
413 =cut
415 sub within_element{
416 my ($self,$name) = @_;
417 return 0 if ( ! defined $name &&
418 ! defined $self->{'_elements'} ||
419 scalar @{$self->{'_elements'}} == 0) ;
420 foreach ( @{$self->{'_elements'}} ) {
421 if( $_ eq $name ) {
422 return 1;
425 return 0;
428 =head2 in_element
430 Title : in_element
431 Usage : if( $eventgenerator->in_element($element) ) {}
432 Function: Test if we are in a particular element
433 This is different than 'in' because within can be tested
434 for a whole block.
435 Returns : boolean
436 Args : string element name
439 =cut
441 sub in_element{
442 my ($self,$name) = @_;
443 return 0 if ! defined $self->{'_elements'}->[0];
444 return ( $self->{'_elements'}->[0] eq $name)
448 =head2 start_document
450 Title : start_document
451 Usage : $eventgenerator->start_document
452 Function: Handles a start document event
453 Returns : none
454 Args : none
457 =cut
459 sub start_document{
460 my ($self) = @_;
461 $self->{'_lasttype'} = '';
462 $self->{'_values'} = {};
463 $self->{'_result'}= undef;
464 $self->{'_mode'} = '';
465 $self->{'_elements'} = [];
469 =head2 end_document
471 Title : end_document
472 Usage : $eventgenerator->end_document
473 Function: Handles an end document event
474 Returns : Bio::Search::Result::ResultI object
475 Args : none
478 =cut
480 sub end_document{
481 my ($self,@args) = @_;
482 return $self->{'_result'};
485 =head2 result_count
487 Title : result_count
488 Usage : my $count = $searchio->result_count
489 Function: Returns the number of results we have processed
490 Returns : integer
491 Args : none
494 =cut
496 sub result_count {
497 my $self = shift;
498 return $self->{'_result_count'};
501 sub report_count { shift->result_count }