lib/SGN/View/Feature.pm

   1 package SGN::View::Feature;
   2 use strict;
   3 use warnings;
   4
   5 use base 'Exporter';
   6
   7 use HTML::Entities;
   8 use List::Util qw/ sum /;
   9 use List::MoreUtils qw/ any uniq /;
  10
  11 use Bio::Seq;
  12
  13 use CXGN::Tools::Text qw/commify_number/;
  14 use CXGN::Tools::Identifiers;
  15
  16
  17 our @EXPORT_OK = qw/
  18     related_stats feature_table
  19     feature_link
  20     cvterm_link
  21     organism_link feature_length
  22     mrna_cds_protein_sequence
  23     description_featureprop_types
  24     get_descriptions
  25     location_list_html
  26     location_list
  27     location_string
  28     location_string_html
  29     type_name
  30 /;
  31
  32 sub type_name {
  33     cvterm_name( shift->type, @_ );
  34 }
  35
  36 sub cvterm_name {
  37     my ($cvt, $caps) = @_;
  38     ( my $n = $cvt->name ) =~ s/_/ /g;
  39     if( $caps ) {
  40         $n =~ s/(\S+)/lc($1) eq $1 ? ucfirst($1) : $1/e;
  41     }
  42     return $n;
  43 }
  44
  45 sub description_featureprop_types {
  46     shift->result_source->schema
  47          ->resultset('Cv::Cvterm')
  48          ->search({
  49              name => [ 'Note',
  50                        'functional_description',
  51                        'Description',
  52                        'description',
  53                      ],
  54            })
  55 }
  56
  57 sub get_descriptions {
  58     my ( $feature, $plain ) = @_;
  59
  60     my $desc_types =
  61         description_featureprop_types( $feature )
  62             ->get_column('cvterm_id')
  63             ->as_query;
  64
  65     my @descriptions =
  66         $feature->search_related('featureprops',
  67                                  { type_id => { -in => $desc_types } },
  68                                  { order_by => 'rank' },
  69                                 )
  70                 ->get_column('value')
  71                 ->all;
  72
  73
  74     return @descriptions if defined $plain;
  75
  76     s/(\S+)/my $id = $1; CXGN::Tools::Identifiers::link_identifier($id) || $id/ge for @descriptions;
  77
  78     return @descriptions;
  79 }
  80
  81 sub location_string_html {
  82     my ( $id, $start, $end, $strand ) = @_;
  83     if( @_ == 1 ) {
  84         my $loc = shift;
  85         $id     = feature_link($loc->srcfeature);
  86         $start  = $loc->fmin+1;
  87         $end    = $loc->fmax;
  88         $strand = $loc->strand;
  89     }
  90     ( $start, $end ) = ( $end, $start ) if $strand && $strand == -1;
  91     return "$id:$start..$end";
  92 }
  93
  94 sub location_string {
  95     my ( $id, $start, $end, $strand ) = @_;
  96     if( @_ == 1 ) {
  97         my $loc = shift;
  98         $id     = $loc->srcfeature->name;
  99         $start  = $loc->fmin+1;
 100         $end    = $loc->fmax;
 101         $strand = $loc->strand;
 102     }
 103     ( $start, $end ) = ( $end, $start ) if $strand && $strand == -1;
 104     return "$id:$start..$end";
 105 }
 106
 107 sub location_list_html {
 108     my ($feature, $featurelocs) = @_;
 109     my @coords = map { location_string_html($_) }
 110         (  #$featurelocs ? $featurelocs->all
 111           #             : $feature->featureloc_features->all );
 112           $featurelocs ? $featurelocs->search({locgroup => 0,},)->all
 113                        : $feature->featureloc_features->search({locgroup => 0,},)->all)
 114         or return '<span class="ghosted">none</span>';
 115     return @coords;
 116 }
 117 sub location_list {
 118     my ($feature, $featurelocs) = @_;
 119     print STDERR "\n\nLOCATON LIST\n\n";
 120     return map { ($_->srcfeature ? $_->srcfeature->name : '<span class="ghosted">null</span>') . ':' . ($_->fmin+1) . '..' . $_->fmax }
 121         ( #$featurelocs ? $featurelocs->all
 122           #             : $feature->featureloc_features->all );
 123     $featurelocs ? $featurelocs->search({locgroup => 0,},)->all
 124                        : $feature->featureloc_features->search({locgroup => 0,},)->all );
 125 }
 126
 127 sub related_stats {
 128     my ($features) = @_;
 129     my $stats = { };
 130     my $total = scalar @$features;
 131     for my $f (@$features) {
 132             $stats->{cvterm_link($f->type)}++;
 133     }
 134     my $data = [ ];
 135     for my $k (sort keys %$stats) {
 136         push @$data, [ $stats->{$k}, $k ];
 137     }
 138     if( 1 < scalar keys %$stats ) {
 139         push @$data, [ $total, "<b>Total</b>" ];
 140     }
 141     return $data;
 142 }
 143
 144 sub feature_table {
 145     my ( $features, $reference_sequence, $omit_columns ) = @_;
 146
 147     { no warnings 'uninitialized';
 148       $omit_columns ||= [];
 149       $omit_columns = [$omit_columns] unless ref $omit_columns eq 'ARRAY';
 150     }
 151
 152     my @data;
 153     for my $f (sort { $a->name cmp $b->name } @$features) {
 154         my @ref_condition =
 155             $reference_sequence ? ( srcfeature_id => $reference_sequence->feature_id )
 156                                 : ();
 157
 158         my @locations = $f->search_related('featureloc_features', {
 159             @ref_condition,
 160             locgroup => 0,
 161            },
 162            { order_by => 'feature_id' }
 163           );
 164
 165         if( @locations ) {
 166         # Add a row for every featureloc
 167             my $first_location = 0;
 168             for my $loc (@locations) {
 169                 my $ref = $loc->srcfeature;
 170                 my ($start,$end) = ($loc->fmin+1, $loc->fmax);
 171                 push @data, [
 172                     ( $first_location++
 173                           ? ('','','')
 174                           : ( organism_link( $f->organism ),
 175                               cvterm_link($f->type),
 176                               feature_link($f),
 177                             )
 178                     ),
 179                     ($ref ? $ref->name : '<span class="ghosted">null</span>').":$start..$end",
 180                     commify_number( feature_length( $f, $loc ) ) || undef,
 181                     $loc->strand ? ( $loc->strand == 1 ? '+' : '-' ) : undef,
 182                     $loc->phase || undef,
 183                     ];
 184             }
 185         }
 186         else {
 187             my $nl = 'not located';
 188             if( $reference_sequence ) {
 189                 $nl .= " on ".encode_entities( $reference_sequence->name )
 190             }
 191             push @data, [
 192                 organism_link( $f->organism ),
 193                 cvterm_link($f->type),
 194                 feature_link($f),
 195                 qq|<span class="ghosted">$nl</span>|,
 196                 commify_number( feature_length( $f, undef ) ) || undef,
 197                 undef,
 198                 undef,
 199             ];
 200         }
 201     }
 202
 203     my @headings = ( "Organism", "Type", "Name", "Location", "Length", "Strand", "Phase" );
 204
 205     my @align = map 'l', @headings;
 206
 207     # omit any columns that are *all* undefined, or that we were
 208     # requested to omit
 209     my @cols_to_omit = uniq(
 210         do {
 211             my %heading_index = do { my $i = 0; map { lc $_ => $i++ } @headings };
 212             (map {
 213                 my $i = $heading_index{lc $_};
 214                 defined $i or die "$_ column not found";
 215                 $i
 216              } @$omit_columns
 217             )
 218         },
 219       );
 220     for my $t ( [\@headings], \@data, [\@align] ) {
 221         for my $row ( @$t ) {
 222             splice( @$row, $_, 1 ) for @cols_to_omit;
 223         }
 224     }
 225
 226     # make html for any other undef cells
 227     for (@data) {
 228         for (@$_) {
 229             $_ = '<span class="ghosted">n/a</span>' unless defined;
 230         }
 231     }
 232
 233     return ( headings => \@headings, data => \@data, __align => \@align, __alt_freq => 0 , __border => 1 );
 234 }
 235
 236 # try to figure out the "length" of a feature, which will vary for different features
 237 sub feature_length {
 238     my ( $feature, $location ) = @_;
 239
 240     $location = $location->first
 241         if $location && $location->isa('DBIx::Class::ResultSet');
 242
 243     my $type      = $feature->type;
 244     my $type_name = $type->name;
 245
 246     # firstly, for any feature, trust the length of its residues if it has them
 247     if( my $seqlen = $feature->seqlen || $feature->residues && length $feature->residues ) {
 248         return $seqlen;
 249     }
 250     # for some features, can say that its length is the length of its location
 251     elsif( any { $type_name eq $_ } qw( exon gene ) ) {
 252         return unless $location;
 253         return $location->fmax - $location->fmin;
 254     }
 255     return;
 256 }
 257
 258 sub _feature_search_string {
 259     my ($fl) = @_;
 260     return '' unless $fl;
 261     return ($fl->srcfeature ? $fl->srcfeature->name : '<span class="ghosted">null</span>') . ':'. ($fl->fmin+1) . '..' . $fl->fmax;
 262 }
 263
 264
 265 ### XXX TODO: A lot of these _link and sequence functions need to be
 266 ### moved to controller code.
 267
 268 sub feature_link {
 269     my ($feature) = @_;
 270     return '<span class="ghosted">null</span>' unless $feature;
 271     my $id   = $feature->feature_id;
 272     my $name = $feature->name;
 273     return qq{<a href="/feature/$id/details">$name</a>};
 274 }
 275
 276 sub organism_link {
 277     my ($organism) = @_;
 278     my $id      = $organism->organism_id;
 279     my $species = $organism->species;
 280     return qq{<a class="species_binomial" href="/chado/organism.pl?organism_id=$id">$species</a>};
 281 }
 282
 283 sub cvterm_link {
 284     my ( $cvt, $caps ) = @_;
 285     my $name = cvterm_name( $cvt, $caps );
 286     my $id   = $cvt->id;
 287     return qq{<a href="/cvterm/$id/view">$name</a>};
 288 }
 289
 290 sub mrna_cds_protein_sequence {
 291     my ($mrna_feature) = @_;
 292
 293     # if we were actually passed a polypeptide, get its mrna(s) and
 294     # recurse
 295     if( $mrna_feature->type->name eq 'polypeptide' ) {
 296         return
 297             map mrna_cds_protein_sequence( $_ ),
 298             $mrna_feature->search_related('feature_relationship_subjects',
 299                     { 'me.type_id' => {
 300                         -in => $mrna_feature->result_source->schema
 301                                             ->resultset('Cv::Cvterm')
 302                                             ->search({name => 'derives_from'})
 303                                             ->get_column('cvterm_id')
 304                                             ->as_query,
 305                     },
 306                   },
 307                )
 308                ->search_related('object');
 309     }
 310
 311     my $description = join ', ', get_descriptions( $mrna_feature, 'no html' );
 312     my $peptide     = _peptides_rs( $mrna_feature )->first;
 313
 314     my @exon_locations = _exon_rs( $mrna_feature )->all;
 315     unless( @exon_locations ) {
 316         # cannot calculate the cds and protein without exons, because
 317         # UTRs can sometimes have introns in them.  without knowing
 318         # the exon structure, we don't know how much to cut off of the
 319         # UTRs
 320         return [
 321             $mrna_feature->subseq(1,1)        ? $mrna_feature : undef,
 322             undef,
 323             $peptide && $peptide->subseq(1,1) ? $peptide      : undef,
 324         ];
 325     }
 326
 327     my $mrna_seq = $mrna_feature->subseq(1,1) ? $mrna_feature : _make_mrna_seq( $mrna_feature, $description, \@exon_locations );
 328     my $peptide_loc = $peptide && _peptide_loc($peptide)->first;
 329
 330     # just return the mrna seq and nothing else if we have no peptide
 331     # or the peptide is not located
 332     unless( $peptide && $peptide_loc ) {
 333         return [ $mrna_seq, undef, undef ] unless $peptide && $peptide_loc;
 334     }
 335
 336     my $cds_seq = Bio::PrimarySeq->new(
 337         -id   => $mrna_seq->display_name,
 338         -desc => $description,
 339         -seq  => $mrna_seq->seq,
 340      );
 341     my ( $trim_from_left, $trim_from_right ) = _calculate_cdna_utr_lengths(
 342         _loc2range( $peptide_loc ),
 343         [ map _loc2range( $_), @exon_locations ],
 344      );
 345
 346     if( $trim_from_left || $trim_from_right ) {
 347         $cds_seq = $cds_seq->trunc( 1+$trim_from_left, $mrna_seq->length - $trim_from_right );
 348     }
 349     ##my $protein_seq = $cds_seq->translate;
 350     ##Get the protein sequence from the peptide object (stored in the database in the residues field of the feature table)
 351     ## No need to try to translade the CDS
 352     my $protein_seq  = Bio::PrimarySeq->new(
 353         -id   => $mrna_seq->display_name,
 354         -desc => $description,
 355         -seq  => $peptide->residues,
 356      );
 357
 358     return [ $mrna_seq, $cds_seq, $protein_seq ];
 359 }
 360
 361 sub _make_mrna_seq {
 362     my ( $mrna_feat, $description, $exons ) = @_;
 363
 364     # NOTE: doing this subseq math in 0-based coords
 365     my $span_start  = $exons->[0]->fmin;
 366     my $span_end    = $exons->[-1]->fmax-1;
 367
 368     # 0 1 2 3 4 5 6 7 8  interbase (Chado)
 369     #  G|C|C|A|T|G|T|A
 370     #  0 1 2 3 4 5 6 7   0-based   (substr)
 371     #  1 2 3 4 5 6 7 8   1-based   (BioPerl)
 372
 373     # recall: the exons are in sorted order
 374     my $span_seq = $exons->[0]->srcfeature->subseq( $span_start+1, $span_end+1 ); #< 1-based
 375     my $mrna_sequence = join '', map { substr($span_seq, $_->fmin - $span_start, $_->fmax - $_->fmin ) } @$exons;
 376
 377     my $mrna_seq = Bio::PrimarySeq->new(
 378         -id   => $mrna_feat->name,
 379         -desc => $description,
 380         -seq  => $mrna_sequence,
 381     );
 382
 383     $mrna_seq = $mrna_seq->revcom if $exons->[0]->strand == -1;
 384
 385     return $mrna_seq;
 386 }
 387
 388 sub _loc2range {
 389     my ( $loc ) = @_;
 390     return $loc->to_range if $loc->can('to_range');
 391     return Bio::Range->new(
 392         -start  => $loc->fmin + 1,
 393         -end    => $loc->fmax,
 394         -strand => $loc->strand,
 395       );
 396 }
 397
 398 # given the range of the peptide and the ranges of each of the exons
 399 # (as Bio::RangeI's), calculate how many bases should be trimmed off
 400 # of each end of the cDNA (i.e. mRNA) seq to get the CDS seq
 401 sub _calculate_cdna_utr_lengths {
 402     my ( $peptide, $exons ) = @_;
 403
 404     my ( $trim_left, $trim_right ) = ( 0, 0 );
 405
 406     # calculate trim_fmin if necessary
 407     if( $exons->[0]->start < $peptide->start ) {
 408
 409         $trim_left =
 410             sum
 411             map {
 412                 $_->overlaps($peptide)
 413                     ? $peptide->start - $_->start
 414                     : $_->length
 415             }
 416             grep $_->start < $peptide->start, # find exons that overlap the UTR
 417             @$exons
 418     }
 419
 420     # calculate trim_fmax if necessary
 421     if( $exons->[-1]->end > $peptide->end ) {
 422         $trim_right =
 423             sum
 424             map {
 425                 $_->overlaps($peptide)
 426                     ? $_->end - $peptide->end
 427                     : $_->length
 428             }
 429             grep $_->end > $peptide->end, # find exons that overlap the UTR
 430             @$exons
 431     }
 432
 433     return $exons->[0]->strand == -1 ? ($trim_right, $trim_left) : ( $trim_left, $trim_right );
 434 }
 435
 436 sub _peptides_rs {
 437     my ( $mrna_feature ) = @_;
 438
 439     $mrna_feature
 440         ->feature_relationship_objects({
 441             'me.type_id' => {
 442                 -in => _cvterm_rs( $mrna_feature, 'relationship', 'derives_from' )
 443                          ->get_column('cvterm_id')
 444                          ->as_query,
 445             },
 446           })
 447         ->search_related( 'subject', {
 448             'subject.type_id' => {
 449                 -in => _cvterm_rs( $mrna_feature, 'sequence', 'polypeptide' )
 450                          ->get_column('cvterm_id')
 451                          ->as_query,
 452             },
 453            })
 454     }
 455 sub _peptide_loc {
 456     my ($rs) = @_;
 457     $rs->search_related( 'featureloc_features', {
 458             #srcfeature_id => { -not => undef },
 459              srcfeature_id => { -not => undef }, locgroup => 0
 460           },
 461           { # Don't prefetch srcfeatures, it significantly slows down the query
 462             # prefetch => 'srcfeature',
 463             order_by => 'fmin',
 464           },
 465          );
 466 }
 467
 468 sub _exon_rs {
 469     my ( $mrna_feature ) = @_;
 470
 471     my $rs = $mrna_feature->feature_relationship_objects({
 472             'me.type_id' => {
 473                 -in => _cvterm_rs( $mrna_feature, 'relationship', 'part_of' )
 474                          ->get_column('cvterm_id')
 475                          ->as_query,
 476             },
 477           },
 478           {
 479               prefetch => 'type',
 480           })
 481         ->search_related( 'subject', {
 482             'subject.type_id' => {
 483                 -in => _cvterm_rs( $mrna_feature, 'sequence', 'exon' )
 484                          ->get_column('cvterm_id')
 485                          ->as_query,
 486             },
 487            },
 488            {
 489                prefetch => 'featureloc_features',
 490            })
 491         ->search_related( 'featureloc_features', {
 492             #srcfeature_id => { -not => undef },
 493              srcfeature_id => { -not => undef }, locgroup => 0
 494           },
 495           {
 496             prefetch => 'srcfeature',
 497             order_by => 'fmin',
 498           },
 499          );
 500     return $rs;
 501 }
 502
 503 sub _cvterm_rs {
 504     my ( $row, $cv, $cvt ) = @_;
 505
 506     return $row->result_source->schema
 507                ->resultset('Cv::Cv')
 508                ->search({ 'me.name' => $cv })
 509                ->search_related('cvterms', {
 510                    'cvterms.name' => $cvt,
 511                  });
 512 }
 513
 514 1;