Bio::DB::Universal: move into its own distribution
[bioperl-live.git] / Bio / SearchIO / gmap_f9.pm
blob3c2dd97fbd495722e6e783fc658a22d1e625b5c4
1 # $Id: gmap_f9.pm 15987 2009-08-18 21:08:55Z lstein $
3 # BioPerl module for Bio::SearchIO::gmap_f9
5 # Cared for by George Hartzell <hartzell@alerce.com>
7 # Copyright George Hartzell
9 # You may distribute this module under the same terms as perl itself
11 # POD documentation - main docs before the code
13 =head1 NAME
15 Bio::SearchIO::gmap_f9 - Event generator for parsing gmap reports (Z format)
17 =head1 SYNOPSIS
19 # Do not use this object directly - it is used as part of the
20 # Bio::SearchIO system.
22 use Bio::SearchIO;
23 my $searchio = Bio::SearchIO->new(-format => 'gmap',
24 -file => 't/data/her2.gmapz');
25 while( my $result = $searchio->next_result ) {
26 while( my $hit = $result->next_hit ) {
27 while( my $hsp = $hit->next_hsp ) {
28 # ...
34 =head1 DESCRIPTION
36 This object encapsulated the necessary methods for generating events
37 suitable for building Bio::Search objects from a GMAP "compressed"
38 report (from gmap run with -Z flag) Read the L<Bio::SearchIO> for more
39 information about how to use this.
41 =head2 REVERSE STRAND AND BIOPERL COORDINATES
43 I believe that I'm doing the correct thing when reporting hits on the
44 negative strand of the genome. In particular, I've compared the
45 "exons" this code generates with the set returned by ncbi's megablast
46 web service. NCBI's hsp's are ordered differently and have a
47 different genomic location (off by ~18,000,000 bases, padding?) but
48 the starts, ends, and lengths were similar and my strand handling
49 matches theirs. E.g.
51 CDNA GENOME
52 start end strand start end strand
54 blast
55 1913 2989 1 86236731 86237808 -1
56 1 475 1 86260509 86260983 -1
57 1510 1727 1 86240259 86240476 -1
58 841 989 1 86243034 86243182 -1
59 1381 1514 1 86240630 86240763 -1
60 989 1122 1 86242457 86242590 -1
61 599 729 1 86247470 86247600 -1
62 473 608 1 86259972 86260107 -1
63 1255 1382 1 86240837 86240964 -1
64 730 842 1 86244040 86244152 -1
65 1813 1921 1 86238123 86238231 -1
66 1725 1814 1 86239747 86239836 -1
67 1167 1256 1 86241294 86241383 -1
68 1120 1188 1 86242319 86242387 -1
70 gmap
71 1 475 1 104330509 104330983 -1
72 476 600 1 104329980 104330104 -1
73 601 729 1 104317470 104317598 -1
74 730 841 1 104314041 104314152 -1
75 842 989 1 104313034 104313181 -1
76 990 1121 1 104312458 104312589 -1
77 1122 1187 1 104312320 104312385 -1
78 1188 1256 1 104311294 104311362 -1
79 1257 1382 1 104310837 104310962 -1
80 1383 1511 1 104310633 104310761 -1
81 1512 1726 1 104310260 104310474 -1
82 1727 1814 1 104309747 104309834 -1
83 1815 1917 1 104308127 104308229 -1
84 1918 2989 1 104306731 104307802 -1
86 =head1 FEEDBACK
88 =head2 Mailing Lists
90 User feedback is an integral part of the evolution of this and other
91 Bioperl modules. Send your comments and suggestions preferably to
92 the Bioperl mailing list. Your participation is much appreciated.
94 bioperl-l@bioperl.org - General discussion
95 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
97 =head2 Reporting Bugs
99 Report bugs to the Bioperl bug tracking system to help us keep track
100 of the bugs and their resolution. Bug reports can be submitted via
101 the web:
103 https://github.com/bioperl/bioperl-live/issues
105 =head1 AUTHOR - George Hartzell
107 Email hartzell@alerce.com
109 =head1 CONTRIBUTORS
111 Additional contributors names and emails here
113 =head1 APPENDIX
115 The rest of the documentation details each of the object methods.
116 Internal methods are usually preceded with an underscore (_).
118 =cut
121 # Let the code begin...
124 package Bio::SearchIO::gmap_f9;
125 use strict;
126 use warnings;
128 use Bio::Search::Hit::GenericHit;
129 use Bio::Search::HSP::GenericHSP;
131 use base qw(Bio::SearchIO );
133 use Data::Dumper;
135 =head2 next_result
137 Title : next_result
138 Usage : $result = stream->next_result
139 Function: Reads the next ResultI object from the stream and returns it.
140 Returns : A Bio::Search::Result::ResultI object
141 Args : n/a
143 =cut
145 sub next_result {
146 my $self = shift;
148 my $info = [];
149 my $result;
150 my $hit;
151 my @hsp_info;
152 my $previous_hit_pos;
154 while ( $_ = $self->_readline ) {
155 if ( $_ =~ /^>/ ) { # looking at the start of a result
156 if ($result) { # and done if there's one in progress
157 $self->_pushback($_);
158 goto DONE;
160 else { # otherwise start a new one.
161 my ($id, $desc, $md5) = m|>([^ ]*)\s*(.*)\s*(?:md5:(.*))?|;
163 $result = Bio::Search::Result::GenericResult->new();
164 $result->algorithm('gmap');
165 $result->query_name($id);
166 $result->query_accession($id);
167 $result->query_description($desc);
168 #$self->warn("Take care of MD5!\n");
170 $hit ||= Bio::Search::Hit::GenericHit->new( -name =>
171 "NONE_SPECIFIED");
174 else { # add another position to the hit/hsp
175 # 468 H 1956 C -14:104307764 2298317517 C H
176 # 468 1957 A -14:104307763 2298317516 A
177 my $c; # info about a column
178 ($c->{query_aa_pos}, $c->{query_aa}, $c->{query_pos},
179 $c->{query_base},
180 $c->{hit_strand}, $c->{hit_chromo}, $c->{hit_pos},
181 $c->{hit_concat_pos}, $c->{hit_base}, $c->{hit_aa})
182 = ($_ =~
184 (\d+)[ ]?(.)?[\t]
185 (\d+)[ ]?(.)?[\t]
186 # TODO chromosome isn't a number... X, Y, MT....
187 (\+\|\-)([\dxXyY]+\|MT):(\d+)[ ](\d+)[ ](.)
188 [\t]?(.)?
192 if ($previous_hit_pos &&
193 (abs($c->{hit_pos} - $previous_hit_pos) > 1)) {
194 $hit ||= Bio::Search::Hit::GenericHit->new( -name =>
195 "NONE_SPECIFIED",
197 $hit->add_hsp( $self->_hsp_from_info(\@hsp_info) );
198 @hsp_info = ();
200 push @hsp_info, $c;
201 $previous_hit_pos = $c->{hit_pos};
205 DONE:
206 if ($result) {
207 $hit->add_hsp( $self->_hsp_from_info(\@hsp_info) ) if (@hsp_info);
209 my ($hit_length,$query_length);
210 for my $hsp ($hit->hsps) {
211 $hit_length += $hsp->length();
212 $query_length += $hsp->length('query');
214 $hit->length($hit_length);
215 $hit->query_length($query_length);
216 # update this now that we actually know something useful.q
217 $hit->name($hsp_info[0]->{hit_chromo});
219 $result->add_hit($hit) if ($hit);
222 return($result);
227 sub _hsp_from_info {
228 my $self = shift;
229 my $info = shift;
230 my $a = {}; # args w/ which we'll create hsp
231 my $hsp;
232 my $identical;
234 $a->{-algorithm} = 'GMAP';
236 for my $c (@{$info}) {
237 $a->{-query_seq} .= $c->{query_base};
238 $a->{-hit_seq} .= $c->{hit_base};
239 $a->{-homology_seq} .= $c->{query_base} eq $c->{hit_base} ? $c->{hit_base} : ' ';
240 $identical++ if ( $c->{query_base} eq $c->{hit_base} );
243 $a->{-query_seq} =~ s| |\-|g; # switch to bioperl gaps.
244 $a->{-hit_seq} =~ s| |\-|g;
246 $a->{-conserved} = $a->{-identical} = $identical;
248 # use the coordinates from from gmap's -f 9 output to
249 # determine whether gmap revcomped the query sequence
250 # to generate the alignment. Note that this is not
251 # the same as the cDNA's sense/anti-sense-ness.
252 $a->{-stranded} = 'both';
254 $a->{-query_start} = $info->[0]->{query_pos};
255 $a->{-query_end} = $info->[-1]->{query_pos};
256 $a->{-query_length} = $a->{-query_end} - $a->{-query_start} + 1;
258 # hit can be either strand, -f 9 output tells us which.
259 # we don't have to worry about it here, but telling the generichsp code
260 # that this hit is 'stranded', it compares the start and end positions
261 # sets it for us.
262 $a->{-hit_start} = $info->[0]->{hit_pos};
263 $a->{-hit_end} = $info->[-1]->{hit_pos};
265 $a->{-hit_length} = abs($a->{-hit_end} - $a->{-hit_start}) + 1;
267 $a->{-hsp_length} =
268 $a->{-query_length} > $a->{-hit_length} ?
269 $a->{-query_length} : $a->{-hit_length};
271 $hsp = Bio::Search::HSP::GenericHSP->new( %$a );
273 return $hsp;
276 # TODO (adjust regexp to swallow lines w/out md5 sig's.
277 sub _parse_path_header {
278 my $self = shift;
279 my $path_line = shift;
280 my $path = {};
283 $path->{query},
284 $path->{db},
285 $path->{path_num},
286 $path->{path_total_num},
287 $path->{query_length},
288 $path->{exon_count},
289 $path->{trimmed_coverage},
290 $path->{percent_identity},
291 $path->{query_start},
292 $path->{query_end},
293 $path->{whole_genome_start},
294 $path->{whole_genome_end},
295 $path->{chromosome},
296 $path->{chromo_start},
297 $path->{chromo_end},
298 $path->{strand},
299 $path->{sense},
300 $path->{md5},
302 ($_ =~ qr|
304 ([^ ]*)[ ] # the query id}, followed by a space
305 ([^ ]*)[ ] # the genome database, followed by a space
306 (\d+)/(\d+)[ ] # path_num/path_total_num (e.g. 3/12)
307 (\d+)[ ] # query length, followed by a space
308 (\d+)[ ] # hsp/exon count, followed by a space
309 (\d+\.\d*)[ ] # trimmed coverage
310 (\d+\.\d*)[ ] # percent identity
311 (\d+)\.\.(\d+)[ ] # query start .. query end, followed by space
312 (\d+)\.\.(\d+)[ ] # whole genome s..e, followed by space
313 (\d+): # chromosome number
314 (\d+)\.\.(\d+)[ ] # chromo s..e, followed by a space
315 ([+-])[ ] # strand, followed by a space
316 dir:(.*) # dir:sense or dir:antisense
317 [ ]md5:([\dabcdefg]+) # md5 signature
321 $path->{query} or $self->throw("query was not found in path line.");
322 $path->{db} or $self->throw("db was not found in path line.");
323 $path->{path_num} or $self->throw("path_num was not found in path line.");
324 $path->{path_total_num} or
325 $self->throw("path_total_num was not found in path line.");
326 $path->{query_length} or
327 $self->throw("query_length was not found in path line.");
328 $path->{exon_count} or
329 $self->throw("exon_count was not found in path line.");
330 $path->{trimmed_coverage} or
331 $self->throw("trimmed_coverage was not found in path line.");
332 $path->{percent_identity} or
333 $self->throw("percent_identity was not found in path line.");
334 $path->{query_start} or
335 $self->throw("query_start was not found in path line.");
336 $path->{query_end} or
337 $self->throw("query_end was not found in path line.");
338 $path->{whole_genome_start} or
339 $self->throw("whole_genome_start was not found in path line.");
340 $path->{whole_genome_end} or
341 $self->throw("whole_genome_end was not found in path line.");
342 $path->{chromosome} or
343 $self->throw("chromosome was not found in path line.");
344 $path->{chromo_start} or
345 $self->throw("chromo_start was not found in path line.");
346 $path->{chromo_end} or
347 $self->throw("chromo_end was not found in path line.");
348 $path->{strand} or $self->throw("strand was not found in path line.");
349 $path->{sense} or $self->throw("sense was not found in path line.");
351 return $path;
354 sub _parse_alignment_line {
355 my $self = shift;
356 my $a_line = shift;
357 my $align = {};
360 $align->{chromo_start},
361 $align->{chromo_end},
362 $align->{query_start},
363 $align->{query_end},
364 $align->{percent_identity},
365 $align->{align_length},
366 $align->{intron_length},
368 ($_ =~ qr|
369 [\t]
370 ([\d]+)[ ] # start in chromosome coord.
371 ([\d]+)[ ] # end in chromosome coord.
372 ([\d]+)[ ] # start in query coord.
373 ([\d]+)[ ] # end in query coord.
374 ([\d]+) # percent identity (as integer)
375 [\t].*[\t] # skip the edit script
376 ([\d]+) # length of alignment block.
377 [\t]*([\d]+)* # length of following intron.
381 $align->{chromo_start}
382 or $self->throw("chromo_start missing in alignment line.");
383 $align->{chromo_end},
384 or $self->throw("chromo_end was missing in alignment line.");
385 $align->{query_start},
386 or $self->throw("query_start was missing in alignment line.");
387 $align->{query_end},
388 or $self->throw("query_end was missing in alignment line.");
389 $align->{percent_identity},
390 or $self->throw("percent_identity was missing in alignment line.");
391 $align->{align_length},
392 or $self->throw("align_length was missing in alignment line.");
394 return $align;