maint: restructure to use Dist::Zilla
[bioperl-live.git] / lib / Bio / SearchIO / psl.pm
blobf166486ba14d40d1d7fc3ca2c817d3a28f9f358c
2 # BioPerl module for Bio::SearchIO::psl
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Jason Stajich <jason-at-bioperl-dot-org>
8 # Copyright Jason Stajich
10 # You may distribute this module under the same terms as perl itself
12 # POD documentation - main docs before the code
14 =head1 NAME
16 Bio::SearchIO::psl - A parser for PSL output (UCSC)
18 =head1 SYNOPSIS
20 use Bio::SearchIO;
21 my $parser = Bio::SearchIO->new(-file => 'file.psl',
22 -format => 'psl');
23 while( my $result = $parser->next_result ) {
26 =head1 DESCRIPTION
28 This is a SearchIO driver for PSL format.
29 PSL format is documented here:
30 http://genome.ucsc.edu/goldenPath/help/customTrack.html#PSL
32 By default it assumes PSL output came from BLAT you can override that
33 by specifying -program_name =E<gt> 'BLASTZ' when initializing the
34 SearchIO object.
36 =head1 FEEDBACK
38 =head2 Mailing Lists
40 User feedback is an integral part of the evolution of this and other
41 Bioperl modules. Send your comments and suggestions preferably to
42 the Bioperl mailing list. Your participation is much appreciated.
44 bioperl-l@bioperl.org - General discussion
45 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
47 =head2 Support
49 Please direct usage questions or support issues to the mailing list:
51 I<bioperl-l@bioperl.org>
53 rather than to the module maintainer directly. Many experienced and
54 reponsive experts will be able look at the problem and quickly
55 address it. Please include a thorough description of the problem
56 with code and data examples if at all possible.
58 =head2 Reporting Bugs
60 Report bugs to the Bioperl bug tracking system to help us keep track
61 of the bugs and their resolution. Bug reports can be submitted via the
62 web:
64 https://github.com/bioperl/bioperl-live/issues
66 =head1 AUTHOR - Jason Stajich
68 Email jason-at-bioperl-dot-org
70 =head1 APPENDIX
72 The rest of the documentation details each of the object methods.
73 Internal methods are usually preceded with a _
75 =cut
77 # Let the code begin...
79 package Bio::SearchIO::psl;
80 use vars qw(%MAPPING %MODEMAP $DEFAULT_WRITER_CLASS $DefaultProgramName);
82 use strict;
83 use Bio::Search::HSP::HSPFactory;
84 use Bio::Search::Hit::HitFactory;
85 use Bio::Search::Result::ResultFactory;
87 $DefaultProgramName = 'BLAT';
88 $DEFAULT_WRITER_CLASS = 'Bio::SearchIO::Writer::HitTableWriter';
90 # mapping of terms to Bioperl hash keys
91 %MODEMAP = (
92 'PSLOutput' => 'result',
93 'Result' => 'result',
94 'Hit' => 'hit',
95 'Hsp' => 'hsp'
98 %MAPPING = (
99 'Hsp_bit-score' => 'HSP-bits',
100 'Hsp_score' => 'HSP-score',
101 'Hsp_evalue' => 'HSP-evalue',
102 'Hsp_query-from' => 'HSP-query_start',
103 'Hsp_query-to' => 'HSP-query_end',
104 'Hsp_hit-from' => 'HSP-hit_start',
105 'Hsp_hit-to' => 'HSP-hit_end',
106 'Hsp_positive' => 'HSP-conserved',
107 'Hsp_identity' => 'HSP-identical',
108 'Hsp_mismatches' => 'HSP-mismatches',
109 'Hsp_qgapblocks' => 'HSP-query_gapblocks',
110 'Hsp_hgapblocks' => 'HSP-hit_gapblocks',
111 'Hsp_gaps' => 'HSP-hsp_gaps',
112 'Hsp_hitgaps' => 'HSP-hit_gaps',
113 'Hsp_querygaps' => 'HSP-query_gaps',
114 'Hsp_align-len' => 'HSP-hsp_length',
115 'Hsp_query-frame' => 'HSP-query_frame',
116 'Hsp_hit-frame' => 'HSP-hit_frame',
118 'Hit_id' => 'HIT-name',
119 'Hit_len' => 'HIT-length',
120 'Hit_accession' => 'HIT-accession',
121 'Hit_def' => 'HIT-description',
122 'Hit_signif' => 'HIT-significance',
123 'Hit_score' => 'HIT-score',
124 'Hit_bits' => 'HIT-bits',
126 'PSLOutput_program' => 'RESULT-algorithm_name',
127 'PSLOutput_version' => 'RESULT-algorithm_version',
128 'PSLOutput_query-def' => 'RESULT-query_name',
129 'PSLOutput_query-len' => 'RESULT-query_length',
130 'PSLOutput_query-acc' => 'RESULT-query_accession',
131 'PSLOutput_querydesc' => 'RESULT-query_description',
132 'PSLOutput_db' => 'RESULT-database_name',
133 'PSLOutput_db-len' => 'RESULT-database_entries',
134 'PSLOutput_db-let' => 'RESULT-database_letters',
137 use base qw(Bio::SearchIO);
139 =head2 new
141 Title : new
142 Usage : my $obj = Bio::SearchIO::psl->new();
143 Function: Builds a new Bio::SearchIO::psl object
144 Returns : an instance of Bio::SearchIO::psl
145 Args :
148 =cut
150 sub _initialize {
151 my ( $self, @args ) = @_;
152 $self->SUPER::_initialize(@args);
153 my ($pname) = $self->_rearrange( [qw(PROGRAM_NAME)], @args );
154 $self->program_name( $pname || $DefaultProgramName );
155 $self->_eventHandler->register_factory(
156 'result',
157 Bio::Search::Result::ResultFactory->new(
158 -type => 'Bio::Search::Result::GenericResult'
162 $self->_eventHandler->register_factory(
163 'hit',
164 Bio::Search::Hit::HitFactory->new(
165 -type => 'Bio::Search::Hit::GenericHit'
168 $self->_eventHandler->register_factory(
169 'hsp',
170 Bio::Search::HSP::HSPFactory->new(
171 -type => 'Bio::Search::HSP::PSLHSP'
176 =head2 next_result
178 Title : next_result
179 Usage : my $result = $parser->next_result
180 Function: Parse the next result from the data stream
181 Returns : L<Bio::Search::Result::ResultI> or undef if no more results
182 Args : none
184 =cut
186 sub next_result {
187 my ($self) = @_;
188 my ( $lastquery, $lasthit );
189 local $/ = "\n";
190 local $_;
192 # skip over any header lines
193 while( defined($_ = $self->_readline) and ! /^\d+\s+\d+\s+/ ) {}
194 $self->_pushback($_);
196 # now start the main parsing loop
197 while ( defined( $_ = $self->_readline ) ) {
198 my (
199 $matches, $mismatches, $rep_matches, $n_count,
200 $q_num_insert, $q_base_insert, $t_num_insert, $t_base_insert,
201 $strand, $q_name, $q_length, $q_start,
202 $q_end, $t_name, $t_length, $t_start,
203 $t_end, $block_count, $block_sizes, $q_starts,
204 $t_starts
205 ) = split;
207 $q_length > 0 or $self->throw("parse error, invalid query length '$q_length'");
208 my $score = sprintf( "%.2f", 100 * ( $matches + $mismatches + $rep_matches ) / $q_length );
210 # this is overall percent identity...
211 my $match_total = $matches + $mismatches + $rep_matches;
212 $match_total > 0
213 or $self->throw("parse error, matches + mismatches + rep_matches must be > 0!");
214 my $percent_id = sprintf("%.2f", 100 * ( $matches + $rep_matches ) / $match_total );
216 # Remember Jim's code is 0 based
217 if ( defined $lastquery
218 && $lastquery ne $q_name )
220 $self->end_element( { 'Name' => 'Hit' } );
221 $self->end_element( { 'Name' => 'PSLOutput' } );
222 $self->_pushback($_);
223 return $self->end_document;
225 elsif ( !defined $lastquery ) {
226 $self->{'_result_count'}++;
227 $self->start_element( { 'Name' => 'PSLOutput' } );
228 $self->element(
230 'Name' => 'PSLOutput_program',
231 'Data' => $self->program_name
234 $self->element(
236 'Name' => 'PSLOutput_query-def',
237 'Data' => $q_name
240 $self->element(
242 'Name' => 'PSLOutput_query-len',
243 'Data' => $q_length
246 $self->start_element( { 'Name' => 'Hit' } );
247 $self->element(
249 'Name' => 'Hit_id',
250 'Data' => $t_name
253 $self->element(
255 'Name' => 'Hit_len',
256 'Data' => $t_length
259 $self->element(
261 'Name' => 'Hit_score',
262 'Data' => $score
266 elsif ( $lasthit ne $t_name ) {
267 $self->end_element( { 'Name' => 'Hit' } );
268 $self->start_element( { 'Name' => 'Hit' } );
269 $self->element(
271 'Name' => 'Hit_id',
272 'Data' => $t_name
275 $self->element(
277 'Name' => 'Hit_len',
278 'Data' => $t_length
281 $self->element(
283 'Name' => 'Hit_score',
284 'Data' => $score
289 my $identical = $matches + $rep_matches;
290 $self->start_element( { 'Name' => 'Hsp' } );
291 $self->element(
293 'Name' => 'Hsp_score',
294 'Data' => $score
297 $self->element(
299 'Name' => 'Hsp_identity',
300 'Data' => $identical
303 $self->element(
305 'Name' => 'Hsp_positive',
306 'Data' => $identical
309 $self->element(
311 'Name' => 'Hsp_mismatches',
312 'Data' => $mismatches
315 $self->element(
317 'Name' => 'Hsp_gaps',
318 'Data' => $q_base_insert + $t_base_insert
322 # query gaps are the number of target inserts and vice-versa
323 $self->element(
325 'Name' => 'Hsp_querygaps',
326 'Data' => $t_base_insert
329 $self->element(
331 'Name' => 'Hsp_hitgaps',
332 'Data' => $q_base_insert
335 if ( $strand eq '+' ) {
336 $self->element(
338 'Name' => 'Hsp_query-from',
339 'Data' => $q_start + 1
342 $self->element(
344 'Name' => 'Hsp_query-to',
345 'Data' => $q_end
349 else {
350 $self->element(
352 'Name' => 'Hsp_query-to',
353 'Data' => $q_start + 1
356 $self->element(
358 'Name' => 'Hsp_query-from',
359 'Data' => $q_end
364 my $hsplen =
365 ($q_base_insert +
366 $t_base_insert +
367 abs( $t_end - $t_start ) +
368 abs( $q_end - $q_start ))/2;
370 $self->element(
372 'Name' => 'Hsp_hit-from',
373 'Data' => $t_start + 1
376 $self->element(
378 'Name' => 'Hsp_hit-to',
379 'Data' => $t_end
382 $self->element(
384 'Name' => 'Hsp_align-len',
385 'Data' => $hsplen
389 # cleanup trailing commas in some output
390 $block_sizes =~ s/\,$//;
391 $q_starts =~ s/\,$//;
392 $t_starts =~ s/\,$//;
393 my @blocksizes = split( /,/, $block_sizes ); # block sizes
394 my @qstarts = split( /,/, $q_starts ); # starting position of each block
395 # in query
396 my @tstarts = split( /,/, $t_starts ); # starting position of each block
397 # in target
398 my ( @qgapblocks, @hgapblocks );
400 for ( my $i = 0 ; $i < $block_count ; $i++ ) {
401 if ( $strand eq '+' ) {
402 push @qgapblocks, [ $qstarts[$i] + 1, $blocksizes[$i] ];
404 else {
405 push @qgapblocks, [ $q_length - $qstarts[$i], $blocksizes[$i] ];
407 push @hgapblocks, [ $tstarts[$i] + 1, $blocksizes[$i] ];
409 $self->element(
411 'Name' => 'Hsp_qgapblocks',
412 'Data' => \@qgapblocks
415 $self->element(
417 'Name' => 'Hsp_hgapblocks',
418 'Data' => \@hgapblocks
421 $self->end_element( { 'Name' => 'Hsp' } );
422 $lastquery = $q_name;
423 $lasthit = $t_name;
425 if ( defined $lasthit || defined $lastquery ) {
426 $self->end_element( { 'Name' => 'Hit' } );
427 $self->end_element( { 'Name' => 'Result' } );
428 return $self->end_document;
432 =head2 start_element
434 Title : start_element
435 Usage : $eventgenerator->start_element
436 Function: Handles a start element event
437 Returns : none
438 Args : hashref with at least 2 keys 'Data' and 'Name'
441 =cut
443 sub start_element {
444 my ( $self, $data ) = @_;
446 # we currently don't care about attributes
447 my $nm = $data->{'Name'};
448 if ( my $type = $MODEMAP{$nm} ) {
449 $self->_mode($type);
450 if ( $self->_eventHandler->will_handle($type) ) {
451 my $func = 'start_'.lc $type;
452 $self->_eventHandler->$func( $data->{'Attributes'} );
454 unshift @{ $self->{'_elements'} }, $type;
456 if ( $nm eq 'PSLOutput' ) {
457 $self->{'_values'} = {};
458 $self->{'_result'} = undef;
459 $self->{'_mode'} = '';
464 =head2 end_element
466 Title : end_element
467 Usage : $eventgenerator->end_element
468 Function: Handles an end element event
469 Returns : return value from the associated end_$type event handler
470 Args : hashref with at least 2 keys 'Data' and 'Name'
473 =cut
475 sub end_element {
476 my ( $self, $data ) = @_;
477 my $nm = $data->{'Name'};
478 my $rc;
480 # Hsp are sort of weird, in that they end when another
481 # object begins so have to detect this in end_element for now
483 if ( my $type = $MODEMAP{$nm} ) {
484 if ( $self->_eventHandler->will_handle($type) ) {
485 my $func = 'end_'.lc $type;
486 $rc = $self->_eventHandler->$func( $self->{'_reporttype'},
487 $self->{'_values'} );
489 shift @{ $self->{'_elements'} };
492 elsif ( $MAPPING{$nm} ) {
493 if ( ref( $MAPPING{$nm} ) =~ /hash/i ) {
494 my $key = ( keys %{ $MAPPING{$nm} } )[0];
495 $self->{'_values'}->{$key}->{ $MAPPING{$nm}->{$key} } =
496 $self->{'_last_data'};
498 else {
499 $self->{'_values'}->{ $MAPPING{$nm} } = $self->{'_last_data'};
502 else {
503 $self->warn(
504 __PACKAGE__ . "::end_element: unknown nm '$nm', ignoring\n" );
506 $self->{'_last_data'} = ''; # remove read data if we are at
507 # end of an element
508 $self->{'_result'} = $rc
509 if ( defined $nm
510 && defined $MODEMAP{$nm}
511 && $MODEMAP{$nm} eq 'result' );
512 return $rc;
516 =head2 element
518 Title : element
519 Usage : $eventhandler->element({'Name' => $name, 'Data' => $str});
520 Function: Convience method that calls start_element, characters, end_element
521 Returns : none
522 Args : Hash ref with the keys 'Name' and 'Data'
525 =cut
527 sub element {
528 my ( $self, $data ) = @_;
529 $self->start_element($data);
530 $self->characters($data);
531 $self->end_element($data);
534 =head2 characters
536 Title : characters
537 Usage : $eventgenerator->characters($str)
538 Function: Send a character events
539 Returns : none
540 Args : string
543 =cut
545 sub characters {
546 my ( $self, $data ) = @_;
548 return unless ( defined $data->{'Data'} );
549 if ( $data->{'Data'} =~ /^\s+$/ ) {
550 return unless $data->{'Name'} =~ /Hsp\_(midline|qseq|hseq)/;
553 if ( $self->in_element('hsp')
554 && $data->{'Name'} =~ /Hsp\_(qseq|hseq|midline)/ )
557 $self->{'_last_hspdata'}->{ $data->{'Name'} } .= $data->{'Data'};
560 $self->{'_last_data'} = $data->{'Data'};
563 =head2 _mode
565 Title : _mode
566 Usage : $obj->_mode($newval)
567 Function:
568 Example :
569 Returns : value of _mode
570 Args : newvalue (optional)
573 =cut
575 sub _mode {
576 my ( $self, $value ) = @_;
577 if ( defined $value ) {
578 $self->{'_mode'} = $value;
580 return $self->{'_mode'};
583 =head2 within_element
585 Title : within_element
586 Usage : if( $eventgenerator->within_element($element) ) {}
587 Function: Test if we are within a particular element
588 This is different than 'in' because within can be tested
589 for a whole block.
590 Returns : boolean
591 Args : string element name
594 =cut
596 sub within_element {
597 my ( $self, $name ) = @_;
598 return 0
599 if (!defined $name && !defined $self->{'_elements'}
600 || scalar @{ $self->{'_elements'} } == 0 );
601 foreach ( @{ $self->{'_elements'} } ) {
602 if ( $_ eq $name ) {
603 return 1;
606 return 0;
609 =head2 in_element
611 Title : in_element
612 Usage : if( $eventgenerator->in_element($element) ) {}
613 Function: Test if we are in a particular element
614 This is different than 'in' because within can be tested
615 for a whole block.
616 Returns : boolean
617 Args : string element name
620 =cut
622 sub in_element {
623 my ( $self, $name ) = @_;
624 return 0 if !defined $self->{'_elements'}->[0];
625 return ( $self->{'_elements'}->[0] eq $name );
628 =head2 start_document
630 Title : start_document
631 Usage : $eventgenerator->start_document
632 Function: Handles a start document event
633 Returns : none
634 Args : none
637 =cut
639 sub start_document {
640 my ($self) = @_;
641 $self->{'_lasttype'} = '';
642 $self->{'_values'} = {};
643 $self->{'_result'} = undef;
644 $self->{'_mode'} = '';
645 $self->{'_elements'} = [];
648 =head2 end_document
650 Title : end_document
651 Usage : $eventgenerator->end_document
652 Function: Handles an end document event
653 Returns : Bio::Search::Result::ResultI object
654 Args : none
657 =cut
659 sub end_document {
660 my ( $self, @args ) = @_;
661 return $self->{'_result'};
664 =head2 result_count
666 Title : result_count
667 Usage : my $count = $searchio->result_count
668 Function: Returns the number of results we have processed
669 Returns : integer
670 Args : none
673 =cut
675 sub result_count {
676 my $self = shift;
677 return $self->{'_result_count'};
680 sub report_count { shift->result_count }
682 =head2 program_name
684 Title : program_name
685 Usage : $obj->program_name($newval)
686 Function: Get/Set the program name
687 Returns : value of program_name (a scalar)
688 Args : on set, new value (a scalar or undef, optional)
691 =cut
693 sub program_name {
694 my $self = shift;
696 $self->{'program_name'} = shift if @_;
697 return $self->{'program_name'} || $DefaultProgramName;