Bio/SearchIO/gmap_f9.pm

   1 # $Id: gmap_f9.pm 15987 2009-08-18 21:08:55Z lstein $
   2 #
   3 # BioPerl module for Bio::SearchIO::gmap_f9
   4 #
   5 # Cared for by George Hartzell <hartzell@alerce.com>
   6 #
   7 # Copyright George Hartzell
   8 #
   9 # You may distribute this module under the same terms as perl itself
  10
  11 # POD documentation - main docs before the code
  12
  13 =head1 NAME
  14
  15 Bio::SearchIO::gmap_f9 - Event generator for parsing gmap reports (Z format)
  16
  17 =head1 SYNOPSIS
  18
  19    # Do not use this object directly - it is used as part of the
  20    # Bio::SearchIO system.
  21
  22     use Bio::SearchIO;
  23     my $searchio = Bio::SearchIO->new(-format => 'gmap',
  24                                       -file   => 't/data/her2.gmapz');
  25     while( my $result = $searchio->next_result ) {
  26         while( my $hit = $result->next_hit ) {
  27             while( my $hsp = $hit->next_hsp ) {
  28                 # ...
  29             }
  30         }
  31     }
  32
  33
  34 =head1 DESCRIPTION
  35
  36 This object encapsulated the necessary methods for generating events
  37 suitable for building Bio::Search objects from a GMAP "compressed"
  38 report (from gmap run with -Z flag) Read the L<Bio::SearchIO> for more
  39 information about how to use this.
  40
  41 =head2 REVERSE STRAND AND BIOPERL COORDINATES
  42
  43 I believe that I'm doing the correct thing when reporting hits on the
  44 negative strand of the genome.  In particular, I've compared the
  45 "exons" this code generates with the set returned by ncbi's megablast
  46 web service.  NCBI's hsp's are ordered differently and have a
  47 different genomic location (off by ~18,000,000 bases, padding?) but
  48 the starts, ends, and lengths were similar and my strand handling
  49 matches theirs.  E.g.
  50
  51    CDNA                            GENOME
  52  start  end    strand   start           end             strand
  53
  54 blast
  55   1913  2989    1       86236731        86237808        -1
  56   1     475     1       86260509        86260983        -1
  57   1510  1727    1       86240259        86240476        -1
  58   841   989     1       86243034        86243182        -1
  59   1381  1514    1       86240630        86240763        -1
  60   989   1122    1       86242457        86242590        -1
  61   599   729     1       86247470        86247600        -1
  62   473   608     1       86259972        86260107        -1
  63   1255  1382    1       86240837        86240964        -1
  64   730   842     1       86244040        86244152        -1
  65   1813  1921    1       86238123        86238231        -1
  66   1725  1814    1       86239747        86239836        -1
  67   1167  1256    1       86241294        86241383        -1
  68   1120  1188    1       86242319        86242387        -1
  69
  70 gmap
  71   1     475     1       104330509       104330983       -1
  72   476   600     1       104329980       104330104       -1
  73   601   729     1       104317470       104317598       -1
  74   730   841     1       104314041       104314152       -1
  75   842   989     1       104313034       104313181       -1
  76   990   1121    1       104312458       104312589       -1
  77   1122  1187    1       104312320       104312385       -1
  78   1188  1256    1       104311294       104311362       -1
  79   1257  1382    1       104310837       104310962       -1
  80   1383  1511    1       104310633       104310761       -1
  81   1512  1726    1       104310260       104310474       -1
  82   1727  1814    1       104309747       104309834       -1
  83   1815  1917    1       104308127       104308229       -1
  84   1918  2989    1       104306731       104307802       -1
  85
  86 =head1 FEEDBACK
  87
  88 =head2 Mailing Lists
  89
  90 User feedback is an integral part of the evolution of this and other
  91 Bioperl modules. Send your comments and suggestions preferably to
  92 the Bioperl mailing list.  Your participation is much appreciated.
  93
  94   bioperl-l@bioperl.org                  - General discussion
  95   http://bioperl.org/wiki/Mailing_lists  - About the mailing lists
  96
  97 =head2 Reporting Bugs
  98
  99 Report bugs to the Bioperl bug tracking system to help us keep track
 100 of the bugs and their resolution. Bug reports can be submitted via
 101 the web:
 102
 103   https://github.com/bioperl/bioperl-live/issues
 104
 105 =head1 AUTHOR - George Hartzell
 106
 107 Email hartzell@alerce.com
 108
 109 =head1 CONTRIBUTORS
 110
 111 Additional contributors names and emails here
 112
 113 =head1 APPENDIX
 114
 115 The rest of the documentation details each of the object methods.
 116 Internal methods are usually preceded with an underscore (_).
 117
 118 =cut
 119
 120
 121 # Let the code begin...
 122
 123
 124 package Bio::SearchIO::gmap_f9;
 125 use strict;
 126 use warnings;
 127
 128 use Bio::Search::Hit::GenericHit;
 129 use Bio::Search::HSP::GenericHSP;
 130
 131 use base qw(Bio::SearchIO );
 132
 133 use Data::Dumper;
 134
 135 =head2 next_result
 136
 137  Title   : next_result
 138  Usage   : $result = stream->next_result
 139  Function: Reads the next ResultI object from the stream and returns it.
 140  Returns : A Bio::Search::Result::ResultI object
 141  Args    : n/a
 142
 143 =cut
 144
 145 sub next_result {
 146   my $self = shift;
 147
 148   my $info = [];
 149   my $result;
 150   my $hit;
 151   my @hsp_info;
 152   my $previous_hit_pos;
 153
 154   while ( $_ = $self->_readline ) {
 155       if ( $_ =~ /^>/ ) {       # looking at the start of a result
 156           if ($result) {        #    and done if there's one in progress
 157               $self->_pushback($_);
 158               goto DONE;
 159           }
 160           else {                #    otherwise start a new one.
 161               my ($id, $desc, $md5) = m|>([^ ]*)\s*(.*)\s*(?:md5:(.*))?|;
 162
 163               $result = Bio::Search::Result::GenericResult->new();
 164               $result->algorithm('gmap');
 165               $result->query_name($id);
 166               $result->query_accession($id);
 167               $result->query_description($desc);
 168               #$self->warn("Take care of MD5!\n");
 169
 170               $hit ||= Bio::Search::Hit::GenericHit->new( -name =>
 171                                                           "NONE_SPECIFIED");
 172           }
 173       }
 174       else {                    # add another position to the hit/hsp
 175           # 468 H       1956 C  -14:104307764 2298317517 C      H
 176           # 468         1957 A  -14:104307763 2298317516 A
 177           my $c;                # info about a column
 178           ($c->{query_aa_pos}, $c->{query_aa}, $c->{query_pos},
 179            $c->{query_base},
 180            $c->{hit_strand}, $c->{hit_chromo}, $c->{hit_pos},
 181            $c->{hit_concat_pos}, $c->{hit_base}, $c->{hit_aa})
 182               = ($_ =~
 183                  m|
 184                    (\d+)[ ]?(.)?[\t]
 185                    (\d+)[ ]?(.)?[\t]
 186                    # TODO chromosome isn't a number... X, Y, MT....
 187                    (\+\|\-)([\dxXyY]+\|MT):(\d+)[ ](\d+)[ ](.)
 188                    [\t]?(.)?
 189                   |xo
 190                 );
 191
 192           if ($previous_hit_pos &&
 193               (abs($c->{hit_pos} - $previous_hit_pos) > 1)) {
 194               $hit ||= Bio::Search::Hit::GenericHit->new( -name =>
 195                                                           "NONE_SPECIFIED",
 196                                                         );
 197               $hit->add_hsp( $self->_hsp_from_info(\@hsp_info) );
 198               @hsp_info = ();
 199           }
 200           push @hsp_info, $c;
 201           $previous_hit_pos = $c->{hit_pos};
 202       }
 203   }
 204
 205  DONE:
 206   if ($result) {
 207       $hit->add_hsp( $self->_hsp_from_info(\@hsp_info) ) if (@hsp_info);
 208
 209       my ($hit_length,$query_length);
 210       for my $hsp ($hit->hsps) {
 211           $hit_length += $hsp->length();
 212       $query_length += $hsp->length('query');
 213       }
 214       $hit->length($hit_length);
 215       $hit->query_length($query_length);
 216       # update this now that we actually know something useful.q
 217       $hit->name($hsp_info[0]->{hit_chromo});
 218
 219       $result->add_hit($hit) if ($hit);
 220   }
 221
 222   return($result);
 223 }
 224
 225
 226
 227 sub _hsp_from_info {
 228     my $self = shift;
 229     my $info = shift;
 230     my $a = {};                 # args w/ which we'll create hsp
 231     my $hsp;
 232     my $identical;
 233
 234     $a->{-algorithm} = 'GMAP';
 235
 236     for my $c (@{$info}) {
 237         $a->{-query_seq} .= $c->{query_base};
 238         $a->{-hit_seq} .= $c->{hit_base};
 239     $a->{-homology_seq} .= $c->{query_base} eq $c->{hit_base} ? $c->{hit_base} : ' ';
 240         $identical++ if ( $c->{query_base} eq $c->{hit_base} );
 241     }
 242
 243     $a->{-query_seq} =~ s| |\-|g;               # switch to bioperl gaps.
 244     $a->{-hit_seq} =~ s| |\-|g;
 245
 246     $a->{-conserved} = $a->{-identical} = $identical;
 247
 248     # use the coordinates from from gmap's -f 9 output to
 249     # determine whether gmap revcomped the query sequence
 250     # to generate the alignment.  Note that this is not
 251     # the same as the cDNA's sense/anti-sense-ness.
 252     $a->{-stranded} = 'both';
 253
 254     $a->{-query_start} = $info->[0]->{query_pos};
 255     $a->{-query_end} = $info->[-1]->{query_pos};
 256     $a->{-query_length} = $a->{-query_end} - $a->{-query_start} + 1;
 257
 258     # hit can be either strand, -f 9 output tells us which.
 259     # we don't have to worry about it here, but telling the generichsp code
 260     # that this hit is 'stranded', it compares the start and end positions
 261     # sets it for us.
 262     $a->{-hit_start} = $info->[0]->{hit_pos};
 263     $a->{-hit_end} = $info->[-1]->{hit_pos};
 264
 265     $a->{-hit_length} = abs($a->{-hit_end} - $a->{-hit_start}) + 1;
 266
 267     $a->{-hsp_length} =
 268         $a->{-query_length} > $a->{-hit_length} ?
 269             $a->{-query_length} : $a->{-hit_length};
 270
 271     $hsp = Bio::Search::HSP::GenericHSP->new( %$a );
 272
 273     return $hsp;
 274 }
 275
 276 # TODO (adjust regexp to swallow lines w/out md5 sig's.
 277 sub _parse_path_header {
 278     my $self = shift;
 279     my $path_line = shift;
 280     my $path = {};
 281
 282     (
 283      $path->{query},
 284      $path->{db},
 285      $path->{path_num},
 286      $path->{path_total_num},
 287      $path->{query_length},
 288      $path->{exon_count},
 289      $path->{trimmed_coverage},
 290      $path->{percent_identity},
 291      $path->{query_start},
 292      $path->{query_end},
 293      $path->{whole_genome_start},
 294      $path->{whole_genome_end},
 295      $path->{chromosome},
 296      $path->{chromo_start},
 297      $path->{chromo_end},
 298      $path->{strand},
 299      $path->{sense},
 300      $path->{md5},
 301     ) =
 302         ($_ =~ qr|
 303                 >
 304                 ([^ ]*)[ ]      # the query id}, followed by a space
 305                 ([^ ]*)[ ]      # the genome database, followed by a space
 306                 (\d+)/(\d+)[ ]  # path_num/path_total_num (e.g. 3/12)
 307                 (\d+)[ ]        # query length, followed by a space
 308                 (\d+)[ ]        # hsp/exon count, followed by a space
 309                 (\d+\.\d*)[ ]   # trimmed coverage
 310                 (\d+\.\d*)[ ]   # percent identity
 311                 (\d+)\.\.(\d+)[ ] # query start .. query end, followed by space
 312                 (\d+)\.\.(\d+)[ ] # whole genome s..e, followed by space
 313                 (\d+):          # chromosome number
 314                 (\d+)\.\.(\d+)[ ] # chromo s..e, followed by a space
 315                 ([+-])[ ]       # strand, followed by a space
 316                 dir:(.*) # dir:sense or dir:antisense
 317                 [ ]md5:([\dabcdefg]+) # md5 signature
 318          |x
 319         );
 320
 321     $path->{query} or $self->throw("query was not found in path line.");
 322     $path->{db} or $self->throw("db was not found in path line.");
 323     $path->{path_num} or $self->throw("path_num was not found in path line.");
 324     $path->{path_total_num} or
 325         $self->throw("path_total_num was not found in path line.");
 326     $path->{query_length} or
 327         $self->throw("query_length was not found in path line.");
 328     $path->{exon_count} or
 329         $self->throw("exon_count was not found in path line.");
 330     $path->{trimmed_coverage} or
 331         $self->throw("trimmed_coverage was not found in path line.");
 332     $path->{percent_identity} or
 333         $self->throw("percent_identity was not found in path line.");
 334     $path->{query_start} or
 335         $self->throw("query_start was not found in path line.");
 336     $path->{query_end} or
 337         $self->throw("query_end was not found in path line.");
 338     $path->{whole_genome_start} or
 339         $self->throw("whole_genome_start was not found in path line.");
 340     $path->{whole_genome_end} or
 341         $self->throw("whole_genome_end was not found in path line.");
 342     $path->{chromosome} or
 343         $self->throw("chromosome was not found in path line.");
 344     $path->{chromo_start} or
 345         $self->throw("chromo_start was not found in path line.");
 346     $path->{chromo_end} or
 347         $self->throw("chromo_end was not found in path line.");
 348     $path->{strand} or $self->throw("strand was not found in path line.");
 349     $path->{sense} or $self->throw("sense was not found in path line.");
 350
 351     return $path;
 352 }
 353
 354 sub _parse_alignment_line {
 355     my $self = shift;
 356     my $a_line = shift;
 357     my $align = {};
 358
 359     (
 360      $align->{chromo_start},
 361      $align->{chromo_end},
 362      $align->{query_start},
 363      $align->{query_end},
 364      $align->{percent_identity},
 365      $align->{align_length},
 366      $align->{intron_length},
 367     ) =
 368         ($_ =~ qr|
 369                 [\t]
 370                 ([\d]+)[ ]      # start in chromosome coord.
 371                 ([\d]+)[ ]      # end in chromosome coord.
 372                 ([\d]+)[ ]      # start in query coord.
 373                 ([\d]+)[ ]      # end in query coord.
 374                 ([\d]+)         # percent identity (as integer)
 375                 [\t].*[\t]      # skip the edit script
 376                 ([\d]+)         # length of alignment block.
 377                 [\t]*([\d]+)*   # length of following intron.
 378          |x
 379         );
 380
 381      $align->{chromo_start}
 382         or $self->throw("chromo_start missing in alignment line.");
 383      $align->{chromo_end},
 384         or $self->throw("chromo_end was missing in alignment line.");
 385      $align->{query_start},
 386         or $self->throw("query_start was missing in alignment line.");
 387      $align->{query_end},
 388         or $self->throw("query_end was missing in alignment line.");
 389      $align->{percent_identity},
 390         or $self->throw("percent_identity was missing in alignment line.");
 391      $align->{align_length},
 392         or $self->throw("align_length was missing in alignment line.");
 393
 394     return $align;
 395 }
 396
 397 1;