lib/SGN/View/Feature.pm

   1 package SGN::View::Feature;
   2 use strict;
   3 use warnings;
   4
   5 use base 'Exporter';
   6
   7 use HTML::Entities;
   8 use List::Util qw/ sum /;
   9 use List::MoreUtils qw/ any uniq /;
  10
  11 use Bio::Seq;
  12
  13 use CXGN::Tools::Text qw/commify_number/;
  14 use CXGN::Tools::Identifiers;
  15
  16
  17 our @EXPORT_OK = qw/
  18     related_stats feature_table
  19     feature_link
  20     cvterm_link
  21     organism_link feature_length
  22     mrna_cds_protein_sequence
  23     description_featureprop_types
  24     get_descriptions
  25     location_list_html
  26     location_list
  27     location_string
  28     location_string_html
  29     type_name
  30     feature_types
  31     feature_organisms
  32 /;
  33
  34
  35
  36 sub feature_organisms {
  37     my ($schema) = @_;
  38     return [
  39         [ 0, '' ],
  40         map [ $_->organism_id, $_->species ],
  41         $schema
  42              ->resultset('Sequence::Feature')
  43              ->search_related('organism' , {}, {
  44                  select   => [qw[ organism.organism_id species ]],
  45                  distinct => 1,
  46                  order_by => 'species',
  47                })
  48     ];
  49 }
  50
  51 sub feature_types {
  52     my ($schema) = @_;
  53
  54     my $ref = [
  55         map [$_->cvterm_id,$_->name],
  56         $schema
  57     ->resultset('Sequence::Feature')
  58     ->search_related(
  59         'type',
  60         {},
  61         { select => [qw[ cvterm_id type.name ]],
  62           group_by => [qw[ cvterm_id type.name ]],
  63           order_by => 'type.name',
  64         },
  65     )
  66     ];
  67     # add an empty option
  68     unshift @$ref , ['0', ''];
  69     return $ref;
  70 }
  71
  72 sub type_name {
  73     cvterm_name( shift->type, @_ );
  74 }
  75
  76 sub cvterm_name {
  77     my ($cvt, $caps) = @_;
  78     ( my $n = $cvt->name ) =~ s/_/ /g;
  79     if( $caps ) {
  80         $n =~ s/(\S+)/lc($1) eq $1 ? ucfirst($1) : $1/e;
  81     }
  82     return $n;
  83 }
  84
  85 sub description_featureprop_types {
  86     shift->result_source->schema
  87          ->resultset('Cv::Cvterm')
  88          ->search({
  89              name => [ 'Note',
  90                        'functional_description',
  91                        'Description',
  92                        'description',
  93                      ],
  94            })
  95 }
  96
  97 sub get_descriptions {
  98     my ( $feature, $plain ) = @_;
  99
 100     my $desc_types =
 101         description_featureprop_types( $feature )
 102             ->get_column('cvterm_id')
 103             ->as_query;
 104
 105     my @descriptions =
 106         $feature->search_related('featureprops',
 107                                  { type_id => { -in => $desc_types } },
 108                                  { order_by => 'rank' },
 109                                 )
 110                 ->get_column('value')
 111                 ->all;
 112
 113
 114     return @descriptions if defined $plain;
 115
 116     s/(\S+)/my $id = $1; CXGN::Tools::Identifiers::link_identifier($id) || $id/ge for @descriptions;
 117
 118     return @descriptions;
 119 }
 120
 121 sub location_string_html {
 122     my ( $id, $start, $end, $strand ) = @_;
 123     if( @_ == 1 ) {
 124         my $loc = shift;
 125         $id     = feature_link($loc->srcfeature);
 126         $start  = $loc->fmin+1;
 127         $end    = $loc->fmax;
 128         $strand = $loc->strand;
 129     }
 130     ( $start, $end ) = ( $end, $start ) if $strand && $strand == -1;
 131     return "$id:$start..$end";
 132 }
 133
 134 sub location_string {
 135     my ( $id, $start, $end, $strand ) = @_;
 136     if( @_ == 1 ) {
 137         my $loc = shift;
 138         $id     = $loc->srcfeature->name;
 139         $start  = $loc->fmin+1;
 140         $end    = $loc->fmax;
 141         $strand = $loc->strand;
 142     }
 143     ( $start, $end ) = ( $end, $start ) if $strand && $strand == -1;
 144     return "$id:$start..$end";
 145 }
 146
 147 sub location_list_html {
 148     my ($feature, $featurelocs) = @_;
 149     my @coords = map { location_string_html($_) }
 150         (  #$featurelocs ? $featurelocs->all
 151           #             : $feature->featureloc_features->all );
 152           $featurelocs ? $featurelocs->search({locgroup => 0,},)->all
 153                        : $feature->featureloc_features->search({locgroup => 0,},)->all)
 154         or return '<span class="ghosted">none</span>';
 155     return @coords;
 156 }
 157 sub location_list {
 158     my ($feature, $featurelocs) = @_;
 159     print STDERR "\n\nLOCATON LIST\n\n";
 160     return map { ($_->srcfeature ? $_->srcfeature->name : '<span class="ghosted">null</span>') . ':' . ($_->fmin+1) . '..' . $_->fmax }
 161         ( #$featurelocs ? $featurelocs->all
 162           #             : $feature->featureloc_features->all );
 163     $featurelocs ? $featurelocs->search({locgroup => 0,},)->all
 164                        : $feature->featureloc_features->search({locgroup => 0,},)->all );
 165 }
 166
 167 sub related_stats {
 168     my ($features) = @_;
 169     my $stats = { };
 170     my $total = scalar @$features;
 171     for my $f (@$features) {
 172             $stats->{cvterm_link($f->type)}++;
 173     }
 174     my $data = [ ];
 175     for my $k (sort keys %$stats) {
 176         push @$data, [ $stats->{$k}, $k ];
 177     }
 178     if( 1 < scalar keys %$stats ) {
 179         push @$data, [ $total, "<b>Total</b>" ];
 180     }
 181     return $data;
 182 }
 183
 184 sub feature_table {
 185     my ( $features, $reference_sequence, $omit_columns ) = @_;
 186
 187     { no warnings 'uninitialized';
 188       $omit_columns ||= [];
 189       $omit_columns = [$omit_columns] unless ref $omit_columns eq 'ARRAY';
 190     }
 191
 192     my @data;
 193     for my $f (sort { $a->name cmp $b->name } @$features) {
 194         my @ref_condition =
 195             $reference_sequence ? ( srcfeature_id => $reference_sequence->feature_id )
 196                                 : ();
 197
 198         my @locations = $f->search_related('featureloc_features', {
 199             @ref_condition,
 200             locgroup => 0,
 201            },
 202            { order_by => 'feature_id' }
 203           );
 204
 205         if( @locations ) {
 206         # Add a row for every featureloc
 207             my $first_location = 0;
 208             for my $loc (@locations) {
 209                 my $ref = $loc->srcfeature;
 210                 my ($start,$end) = ($loc->fmin+1, $loc->fmax);
 211                 push @data, [
 212                     ( $first_location++
 213                           ? ('','','')
 214                           : ( organism_link( $f->organism ),
 215                               cvterm_link($f->type),
 216                               feature_link($f),
 217                             )
 218                     ),
 219                     ($ref ? $ref->name : '<span class="ghosted">null</span>').":$start..$end",
 220                     commify_number( feature_length( $f, $loc ) ) || undef,
 221                     $loc->strand ? ( $loc->strand == 1 ? '+' : '-' ) : undef,
 222                     $loc->phase || undef,
 223                     ];
 224             }
 225         }
 226         else {
 227             my $nl = 'not located';
 228             if( $reference_sequence ) {
 229                 $nl .= " on ".encode_entities( $reference_sequence->name )
 230             }
 231             push @data, [
 232                 organism_link( $f->organism ),
 233                 cvterm_link($f->type),
 234                 feature_link($f),
 235                 qq|<span class="ghosted">$nl</span>|,
 236                 commify_number( feature_length( $f, undef ) ) || undef,
 237                 undef,
 238                 undef,
 239             ];
 240         }
 241     }
 242
 243     my @headings = ( "Organism", "Type", "Name", "Location", "Length", "Strand", "Phase" );
 244
 245     my @align = map 'l', @headings;
 246
 247     # omit any columns that are *all* undefined, or that we were
 248     # requested to omit
 249     my @cols_to_omit = uniq(
 250         do {
 251             my %heading_index = do { my $i = 0; map { lc $_ => $i++ } @headings };
 252             (map {
 253                 my $i = $heading_index{lc $_};
 254                 defined $i or die "$_ column not found";
 255                 $i
 256              } @$omit_columns
 257             )
 258         },
 259       );
 260     for my $t ( [\@headings], \@data, [\@align] ) {
 261         for my $row ( @$t ) {
 262             splice( @$row, $_, 1 ) for @cols_to_omit;
 263         }
 264     }
 265
 266     # make html for any other undef cells
 267     for (@data) {
 268         for (@$_) {
 269             $_ = '<span class="ghosted">n/a</span>' unless defined;
 270         }
 271     }
 272
 273     return ( headings => \@headings, data => \@data, __align => \@align, __alt_freq => 0 , __border => 1 );
 274 }
 275
 276 # try to figure out the "length" of a feature, which will vary for different features
 277 sub feature_length {
 278     my ( $feature, $location ) = @_;
 279
 280     $location = $location->first
 281         if $location && $location->isa('DBIx::Class::ResultSet');
 282
 283     my $type      = $feature->type;
 284     my $type_name = $type->name;
 285
 286     # firstly, for any feature, trust the length of its residues if it has them
 287     if( my $seqlen = $feature->seqlen || $feature->residues && length $feature->residues ) {
 288         return $seqlen;
 289     }
 290     # for some features, can say that its length is the length of its location
 291     elsif( any { $type_name eq $_ } qw( exon gene ) ) {
 292         return unless $location;
 293         return $location->fmax - $location->fmin;
 294     }
 295     return;
 296 }
 297
 298 sub _feature_search_string {
 299     my ($fl) = @_;
 300     return '' unless $fl;
 301     return ($fl->srcfeature ? $fl->srcfeature->name : '<span class="ghosted">null</span>') . ':'. ($fl->fmin+1) . '..' . $fl->fmax;
 302 }
 303
 304
 305 ### XXX TODO: A lot of these _link and sequence functions need to be
 306 ### moved to controller code.
 307
 308 sub feature_link {
 309     my ($feature) = @_;
 310     return '<span class="ghosted">null</span>' unless $feature;
 311     my $id   = $feature->feature_id;
 312     my $name = $feature->name;
 313     return qq{<a href="/feature/$id/details">$name</a>};
 314 }
 315
 316 sub organism_link {
 317     my ($organism) = @_;
 318     my $id      = $organism->organism_id;
 319     my $species = $organism->species;
 320     return qq{<a class="species_binomial" href="/chado/organism.pl?organism_id=$id">$species</a>};
 321 }
 322
 323 sub cvterm_link {
 324     my ( $cvt, $caps ) = @_;
 325     my $name = cvterm_name( $cvt, $caps );
 326     my $id   = $cvt->id;
 327     return qq{<a href="/cvterm/$id/view">$name</a>};
 328 }
 329
 330 sub mrna_cds_protein_sequence {
 331     my ($mrna_feature) = @_;
 332
 333     # if we were actually passed a polypeptide, get its mrna(s) and
 334     # recurse
 335     if( $mrna_feature->type->name eq 'polypeptide' ) {
 336         return
 337             map mrna_cds_protein_sequence( $_ ),
 338             $mrna_feature->search_related('feature_relationship_subjects',
 339                     { 'me.type_id' => {
 340                         -in => $mrna_feature->result_source->schema
 341                                             ->resultset('Cv::Cvterm')
 342                                             ->search({name => 'derives_from'})
 343                                             ->get_column('cvterm_id')
 344                                             ->as_query,
 345                     },
 346                   },
 347                )
 348                ->search_related('object');
 349     }
 350
 351     my $description = join ', ', get_descriptions( $mrna_feature, 'no html' );
 352     my $peptide     = _peptides_rs( $mrna_feature )->first;
 353
 354     my @exon_locations = _exon_rs( $mrna_feature )->all;
 355     unless( @exon_locations ) {
 356         # cannot calculate the cds and protein without exons, because
 357         # UTRs can sometimes have introns in them.  without knowing
 358         # the exon structure, we don't know how much to cut off of the
 359         # UTRs
 360         return [
 361             $mrna_feature->subseq(1,1)        ? $mrna_feature : undef,
 362             undef,
 363             $peptide && $peptide->subseq(1,1) ? $peptide      : undef,
 364         ];
 365     }
 366
 367     my $mrna_seq = $mrna_feature->subseq(1,1) ? $mrna_feature : _make_mrna_seq( $mrna_feature, $description, \@exon_locations );
 368     my $peptide_loc = $peptide && _peptide_loc($peptide)->first;
 369
 370     # just return the mrna seq and nothing else if we have no peptide
 371     # or the peptide is not located
 372     unless( $peptide && $peptide_loc ) {
 373         return [ $mrna_seq, undef, undef ] unless $peptide && $peptide_loc;
 374     }
 375
 376     my $cds_seq = Bio::PrimarySeq->new(
 377         -id   => $mrna_seq->display_name,
 378         -desc => $description,
 379         -seq  => $mrna_seq->seq,
 380      );
 381     my ( $trim_from_left, $trim_from_right ) = _calculate_cdna_utr_lengths(
 382         _loc2range( $peptide_loc ),
 383         [ map _loc2range( $_), @exon_locations ],
 384      );
 385
 386     if( $trim_from_left || $trim_from_right ) {
 387         $cds_seq = $cds_seq->trunc( 1+$trim_from_left, $mrna_seq->length - $trim_from_right );
 388     }
 389
 390     ##Get the protein sequence from the peptide object (stored in the database in the residues field of the feature table)
 391     my $protein_seq  = Bio::PrimarySeq->new(
 392         -id   => $mrna_seq->display_name,
 393         -desc => $description,
 394         -seq  => $peptide->residues,
 395      );
 396
 397     #Get the protein seq from translated CDS if no residues are found for polypeptide in the DB
 398     if ( !$protein_seq->seq ) {
 399       $protein_seq = $cds_seq->translate;
 400     }
 401
 402     return [ $mrna_seq, $cds_seq, $protein_seq ];
 403 }
 404
 405 sub _make_mrna_seq {
 406     my ( $mrna_feat, $description, $exons ) = @_;
 407
 408     # NOTE: doing this subseq math in 0-based coords
 409     my $span_start  = $exons->[0]->fmin;
 410     my $span_end    = $exons->[-1]->fmax-1;
 411
 412     # 0 1 2 3 4 5 6 7 8  interbase (Chado)
 413     #  G|C|C|A|T|G|T|A
 414     #  0 1 2 3 4 5 6 7   0-based   (substr)
 415     #  1 2 3 4 5 6 7 8   1-based   (BioPerl)
 416
 417     # recall: the exons are in sorted order
 418     my $span_seq = $exons->[0]->srcfeature->subseq( $span_start+1, $span_end+1 ); #< 1-based
 419     my $mrna_sequence = join '', map { substr($span_seq, $_->fmin - $span_start, $_->fmax - $_->fmin ) } @$exons;
 420
 421     my $mrna_seq = Bio::PrimarySeq->new(
 422         -id   => $mrna_feat->name,
 423         -desc => $description,
 424         -seq  => $mrna_sequence,
 425     );
 426
 427     $mrna_seq = $mrna_seq->revcom if $exons->[0]->strand == -1;
 428
 429     return $mrna_seq;
 430 }
 431
 432 sub _loc2range {
 433     my ( $loc ) = @_;
 434     return $loc->to_range if $loc->can('to_range');
 435     return Bio::Range->new(
 436         -start  => $loc->fmin + 1,
 437         -end    => $loc->fmax,
 438         -strand => $loc->strand,
 439       );
 440 }
 441
 442 # given the range of the peptide and the ranges of each of the exons
 443 # (as Bio::RangeI's), calculate how many bases should be trimmed off
 444 # of each end of the cDNA (i.e. mRNA) seq to get the CDS seq
 445 sub _calculate_cdna_utr_lengths {
 446     my ( $peptide, $exons ) = @_;
 447
 448     my ( $trim_left, $trim_right ) = ( 0, 0 );
 449
 450     # calculate trim_fmin if necessary
 451     if( $exons->[0]->start < $peptide->start ) {
 452
 453         $trim_left =
 454             sum
 455             map {
 456                 $_->overlaps($peptide)
 457                     ? $peptide->start - $_->start
 458                     : $_->length
 459             }
 460             grep $_->start < $peptide->start, # find exons that overlap the UTR
 461             @$exons
 462     }
 463
 464     # calculate trim_fmax if necessary
 465     if( $exons->[-1]->end > $peptide->end ) {
 466         $trim_right =
 467             sum
 468             map {
 469                 $_->overlaps($peptide)
 470                     ? $_->end - $peptide->end
 471                     : $_->length
 472             }
 473             grep $_->end > $peptide->end, # find exons that overlap the UTR
 474             @$exons
 475     }
 476
 477     return $exons->[0]->strand == -1 ? ($trim_right, $trim_left) : ( $trim_left, $trim_right );
 478 }
 479
 480 sub _peptides_rs {
 481     my ( $mrna_feature ) = @_;
 482
 483     $mrna_feature
 484         ->feature_relationship_objects({
 485             'me.type_id' => {
 486                 -in => _cvterm_rs( $mrna_feature, 'relationship', 'derives_from' )
 487                          ->get_column('cvterm_id')
 488                          ->as_query,
 489             },
 490           })
 491         ->search_related( 'subject', {
 492             'subject.type_id' => {
 493                 -in => _cvterm_rs( $mrna_feature, 'sequence', 'polypeptide' )
 494                          ->get_column('cvterm_id')
 495                          ->as_query,
 496             },
 497            })
 498     }
 499 sub _peptide_loc {
 500     my ($rs) = @_;
 501     $rs->search_related( 'featureloc_features', {
 502             #srcfeature_id => { -not => undef },
 503              srcfeature_id => { -not => undef }, locgroup => 0
 504           },
 505           { # Don't prefetch srcfeatures, it significantly slows down the query
 506             # prefetch => 'srcfeature',
 507             order_by => 'fmin',
 508           },
 509          );
 510 }
 511
 512 sub _exon_rs {
 513     my ( $mrna_feature ) = @_;
 514
 515     my $rs = $mrna_feature->feature_relationship_objects({
 516             'me.type_id' => {
 517                 -in => _cvterm_rs( $mrna_feature, 'relationship', 'part_of' )
 518                          ->get_column('cvterm_id')
 519                          ->as_query,
 520             },
 521           },
 522           {
 523               prefetch => 'type',
 524           })
 525         ->search_related( 'subject', {
 526             'subject.type_id' => {
 527                 -in => _cvterm_rs( $mrna_feature, 'sequence', 'exon' )
 528                          ->get_column('cvterm_id')
 529                          ->as_query,
 530             },
 531            },
 532            {
 533                prefetch => 'featureloc_features',
 534            })
 535         ->search_related( 'featureloc_features', {
 536             #srcfeature_id => { -not => undef },
 537              srcfeature_id => { -not => undef }, locgroup => 0
 538           },
 539           {
 540             prefetch => 'srcfeature',
 541             order_by => 'fmin',
 542           },
 543          );
 544     return $rs;
 545 }
 546
 547 sub _cvterm_rs {
 548     my ( $row, $cv, $cvt ) = @_;
 549
 550     return $row->result_source->schema
 551                ->resultset('Cv::Cv')
 552                ->search({ 'me.name' => $cv })
 553                ->search_related('cvterms', {
 554                    'cvterms.name' => $cvt,
 555                  });
 556 }
 557
 558 1;