Merge pull request #5205 from solgenomics/topic/generic_trial_upload
[sgn.git] / lib / SGN / View / Feature.pm
blob0716a8933716d3e4f3b60a80882c0f15260d9f24
1 package SGN::View::Feature;
2 use strict;
3 use warnings;
5 use base 'Exporter';
7 use HTML::Entities;
8 use List::Util qw/ sum /;
9 use List::MoreUtils qw/ any uniq /;
11 use Bio::Seq;
13 use CXGN::Tools::Text qw/commify_number/;
14 use CXGN::Tools::Identifiers;
17 our @EXPORT_OK = qw/
18 related_stats feature_table
19 feature_link
20 cvterm_link
21 organism_link feature_length
22 mrna_cds_protein_sequence
23 description_featureprop_types
24 get_descriptions
25 location_list_html
26 location_list
27 location_string
28 location_string_html
29 type_name
30 feature_types
31 feature_organisms
36 sub feature_organisms {
37 my ($schema) = @_;
38 return [
39 [ 0, '' ],
40 map [ $_->organism_id, $_->species ],
41 $schema
42 ->resultset('Sequence::Feature')
43 ->search_related('organism' , {}, {
44 select => [qw[ organism.organism_id species ]],
45 distinct => 1,
46 order_by => 'species',
51 sub feature_types {
52 my ($schema) = @_;
54 my $ref = [
55 map [$_->cvterm_id,$_->name],
56 $schema
57 ->resultset('Sequence::Feature')
58 ->search_related(
59 'type',
60 {},
61 { select => [qw[ cvterm_id type.name ]],
62 group_by => [qw[ cvterm_id type.name ]],
63 order_by => 'type.name',
67 # add an empty option
68 unshift @$ref , ['0', ''];
69 return $ref;
72 sub type_name {
73 cvterm_name( shift->type, @_ );
76 sub cvterm_name {
77 my ($cvt, $caps) = @_;
78 ( my $n = $cvt->name ) =~ s/_/ /g;
79 if( $caps ) {
80 $n =~ s/(\S+)/lc($1) eq $1 ? ucfirst($1) : $1/e;
82 return $n;
85 sub description_featureprop_types {
86 shift->result_source->schema
87 ->resultset('Cv::Cvterm')
88 ->search({
89 name => [ 'Note',
90 'functional_description',
91 'Description',
92 'description',
97 sub get_descriptions {
98 my ( $feature, $plain ) = @_;
100 my $desc_types =
101 description_featureprop_types( $feature )
102 ->get_column('cvterm_id')
103 ->as_query;
105 my @descriptions =
106 $feature->search_related('featureprops',
107 { type_id => { -in => $desc_types } },
108 { order_by => 'rank' },
110 ->get_column('value')
111 ->all;
114 return @descriptions if defined $plain;
116 s/(\S+)/my $id = $1; CXGN::Tools::Identifiers::link_identifier($id) || $id/ge for @descriptions;
118 return @descriptions;
121 sub location_string_html {
122 my ( $id, $start, $end, $strand ) = @_;
123 if( @_ == 1 ) {
124 my $loc = shift;
125 $id = feature_link($loc->srcfeature);
126 $start = $loc->fmin+1;
127 $end = $loc->fmax;
128 $strand = $loc->strand;
130 ( $start, $end ) = ( $end, $start ) if $strand && $strand == -1;
131 return "$id:$start..$end";
134 sub location_string {
135 my ( $id, $start, $end, $strand ) = @_;
136 if( @_ == 1 ) {
137 my $loc = shift;
138 $id = $loc->srcfeature->name;
139 $start = $loc->fmin+1;
140 $end = $loc->fmax;
141 $strand = $loc->strand;
143 ( $start, $end ) = ( $end, $start ) if $strand && $strand == -1;
144 return "$id:$start..$end";
147 sub location_list_html {
148 my ($feature, $featurelocs) = @_;
149 my @coords = map { location_string_html($_) }
150 ( #$featurelocs ? $featurelocs->all
151 # : $feature->featureloc_features->all );
152 $featurelocs ? $featurelocs->search({locgroup => 0,},)->all
153 : $feature->featureloc_features->search({locgroup => 0,},)->all)
154 or return '<span class="ghosted">none</span>';
155 return @coords;
157 sub location_list {
158 my ($feature, $featurelocs) = @_;
159 print STDERR "\n\nLOCATON LIST\n\n";
160 return map { ($_->srcfeature ? $_->srcfeature->name : '<span class="ghosted">null</span>') . ':' . ($_->fmin+1) . '..' . $_->fmax }
161 ( #$featurelocs ? $featurelocs->all
162 # : $feature->featureloc_features->all );
163 $featurelocs ? $featurelocs->search({locgroup => 0,},)->all
164 : $feature->featureloc_features->search({locgroup => 0,},)->all );
167 sub related_stats {
168 my ($features) = @_;
169 my $stats = { };
170 my $total = scalar @$features;
171 for my $f (@$features) {
172 $stats->{cvterm_link($f->type)}++;
174 my $data = [ ];
175 for my $k (sort keys %$stats) {
176 push @$data, [ $stats->{$k}, $k ];
178 if( 1 < scalar keys %$stats ) {
179 push @$data, [ $total, "<b>Total</b>" ];
181 return $data;
184 sub feature_table {
185 my ( $features, $reference_sequence, $omit_columns ) = @_;
187 { no warnings 'uninitialized';
188 $omit_columns ||= [];
189 $omit_columns = [$omit_columns] unless ref $omit_columns eq 'ARRAY';
192 my @data;
193 for my $f (sort { $a->name cmp $b->name } @$features) {
194 my @ref_condition =
195 $reference_sequence ? ( srcfeature_id => $reference_sequence->feature_id )
196 : ();
198 my @locations = $f->search_related('featureloc_features', {
199 @ref_condition,
200 locgroup => 0,
202 { order_by => 'feature_id' }
205 if( @locations ) {
206 # Add a row for every featureloc
207 my $first_location = 0;
208 for my $loc (@locations) {
209 my $ref = $loc->srcfeature;
210 my ($start,$end) = ($loc->fmin+1, $loc->fmax);
211 push @data, [
212 ( $first_location++
213 ? ('','','')
214 : ( organism_link( $f->organism ),
215 cvterm_link($f->type),
216 feature_link($f),
219 ($ref ? $ref->name : '<span class="ghosted">null</span>').":$start..$end",
220 commify_number( feature_length( $f, $loc ) ) || undef,
221 $loc->strand ? ( $loc->strand == 1 ? '+' : '-' ) : undef,
222 $loc->phase || undef,
226 else {
227 my $nl = 'not located';
228 if( $reference_sequence ) {
229 $nl .= " on ".encode_entities( $reference_sequence->name )
231 push @data, [
232 organism_link( $f->organism ),
233 cvterm_link($f->type),
234 feature_link($f),
235 qq|<span class="ghosted">$nl</span>|,
236 commify_number( feature_length( $f, undef ) ) || undef,
237 undef,
238 undef,
243 my @headings = ( "Organism", "Type", "Name", "Location", "Length", "Strand", "Phase" );
245 my @align = map 'l', @headings;
247 # omit any columns that are *all* undefined, or that we were
248 # requested to omit
249 my @cols_to_omit = uniq(
250 do {
251 my %heading_index = do { my $i = 0; map { lc $_ => $i++ } @headings };
252 (map {
253 my $i = $heading_index{lc $_};
254 defined $i or die "$_ column not found";
256 } @$omit_columns
260 for my $t ( [\@headings], \@data, [\@align] ) {
261 for my $row ( @$t ) {
262 splice( @$row, $_, 1 ) for @cols_to_omit;
266 # make html for any other undef cells
267 for (@data) {
268 for (@$_) {
269 $_ = '<span class="ghosted">n/a</span>' unless defined;
273 return ( headings => \@headings, data => \@data, __align => \@align, __alt_freq => 0 , __border => 1 );
276 # try to figure out the "length" of a feature, which will vary for different features
277 sub feature_length {
278 my ( $feature, $location ) = @_;
280 $location = $location->first
281 if $location && $location->isa('DBIx::Class::ResultSet');
283 my $type = $feature->type;
284 my $type_name = $type->name;
286 # firstly, for any feature, trust the length of its residues if it has them
287 if( my $seqlen = $feature->seqlen || $feature->residues && length $feature->residues ) {
288 return $seqlen;
290 # for some features, can say that its length is the length of its location
291 elsif( any { $type_name eq $_ } qw( exon gene ) ) {
292 return unless $location;
293 return $location->fmax - $location->fmin;
295 return;
298 sub _feature_search_string {
299 my ($fl) = @_;
300 return '' unless $fl;
301 return ($fl->srcfeature ? $fl->srcfeature->name : '<span class="ghosted">null</span>') . ':'. ($fl->fmin+1) . '..' . $fl->fmax;
305 ### XXX TODO: A lot of these _link and sequence functions need to be
306 ### moved to controller code.
308 sub feature_link {
309 my ($feature) = @_;
310 return '<span class="ghosted">null</span>' unless $feature;
311 my $id = $feature->feature_id;
312 my $name = $feature->name;
313 return qq{<a href="/feature/$id/details">$name</a>};
316 sub organism_link {
317 my ($organism) = @_;
318 my $id = $organism->organism_id;
319 my $species = $organism->species;
320 return qq{<a class="species_binomial" href="/chado/organism.pl?organism_id=$id">$species</a>};
323 sub cvterm_link {
324 my ( $cvt, $caps ) = @_;
325 my $name = cvterm_name( $cvt, $caps );
326 my $id = $cvt->id;
327 return qq{<a href="/cvterm/$id/view">$name</a>};
330 sub mrna_cds_protein_sequence {
331 my ($mrna_feature) = @_;
333 # if we were actually passed a polypeptide, get its mrna(s) and
334 # recurse
335 if( $mrna_feature->type->name eq 'polypeptide' ) {
336 return
337 map mrna_cds_protein_sequence( $_ ),
338 $mrna_feature->search_related('feature_relationship_subjects',
339 { 'me.type_id' => {
340 -in => $mrna_feature->result_source->schema
341 ->resultset('Cv::Cvterm')
342 ->search({name => 'derives_from'})
343 ->get_column('cvterm_id')
344 ->as_query,
348 ->search_related('object');
351 my $description = join ', ', get_descriptions( $mrna_feature, 'no html' );
352 my $peptide = _peptides_rs( $mrna_feature )->first;
354 my @exon_locations = _exon_rs( $mrna_feature )->all;
355 unless( @exon_locations ) {
356 # cannot calculate the cds and protein without exons, because
357 # UTRs can sometimes have introns in them. without knowing
358 # the exon structure, we don't know how much to cut off of the
359 # UTRs
360 return [
361 $mrna_feature->subseq(1,1) ? $mrna_feature : undef,
362 undef,
363 $peptide && $peptide->subseq(1,1) ? $peptide : undef,
367 my $mrna_seq = $mrna_feature->subseq(1,1) ? $mrna_feature : _make_mrna_seq( $mrna_feature, $description, \@exon_locations );
368 my $peptide_loc = $peptide && _peptide_loc($peptide)->first;
370 # just return the mrna seq and nothing else if we have no peptide
371 # or the peptide is not located
372 unless( $peptide && $peptide_loc ) {
373 return [ $mrna_seq, undef, undef ] unless $peptide && $peptide_loc;
376 my $cds_seq = Bio::PrimarySeq->new(
377 -id => $mrna_seq->display_name,
378 -desc => $description,
379 -seq => $mrna_seq->seq,
381 my ( $trim_from_left, $trim_from_right ) = _calculate_cdna_utr_lengths(
382 _loc2range( $peptide_loc ),
383 [ map _loc2range( $_), @exon_locations ],
386 if( $trim_from_left || $trim_from_right ) {
387 $cds_seq = $cds_seq->trunc( 1+$trim_from_left, $mrna_seq->length - $trim_from_right );
390 ##Get the protein sequence from the peptide object (stored in the database in the residues field of the feature table)
391 my $protein_seq = Bio::PrimarySeq->new(
392 -id => $mrna_seq->display_name,
393 -desc => $description,
394 -seq => $peptide->residues,
397 #Get the protein seq from translated CDS if no residues are found for polypeptide in the DB
398 if ( !$protein_seq->seq ) {
399 $protein_seq = $cds_seq->translate;
402 return [ $mrna_seq, $cds_seq, $protein_seq ];
405 sub _make_mrna_seq {
406 my ( $mrna_feat, $description, $exons ) = @_;
408 # NOTE: doing this subseq math in 0-based coords
409 my $span_start = $exons->[0]->fmin;
410 my $span_end = $exons->[-1]->fmax-1;
412 # 0 1 2 3 4 5 6 7 8 interbase (Chado)
413 # G|C|C|A|T|G|T|A
414 # 0 1 2 3 4 5 6 7 0-based (substr)
415 # 1 2 3 4 5 6 7 8 1-based (BioPerl)
417 # recall: the exons are in sorted order
418 my $span_seq = $exons->[0]->srcfeature->subseq( $span_start+1, $span_end+1 ); #< 1-based
419 my $mrna_sequence = join '', map { substr($span_seq, $_->fmin - $span_start, $_->fmax - $_->fmin ) } @$exons;
421 my $mrna_seq = Bio::PrimarySeq->new(
422 -id => $mrna_feat->name,
423 -desc => $description,
424 -seq => $mrna_sequence,
427 $mrna_seq = $mrna_seq->revcom if $exons->[0]->strand == -1;
429 return $mrna_seq;
432 sub _loc2range {
433 my ( $loc ) = @_;
434 return $loc->to_range if $loc->can('to_range');
435 return Bio::Range->new(
436 -start => $loc->fmin + 1,
437 -end => $loc->fmax,
438 -strand => $loc->strand,
442 # given the range of the peptide and the ranges of each of the exons
443 # (as Bio::RangeI's), calculate how many bases should be trimmed off
444 # of each end of the cDNA (i.e. mRNA) seq to get the CDS seq
445 sub _calculate_cdna_utr_lengths {
446 my ( $peptide, $exons ) = @_;
448 my ( $trim_left, $trim_right ) = ( 0, 0 );
450 # calculate trim_fmin if necessary
451 if( $exons->[0]->start < $peptide->start ) {
453 $trim_left =
455 map {
456 $_->overlaps($peptide)
457 ? $peptide->start - $_->start
458 : $_->length
460 grep $_->start < $peptide->start, # find exons that overlap the UTR
461 @$exons
464 # calculate trim_fmax if necessary
465 if( $exons->[-1]->end > $peptide->end ) {
466 $trim_right =
468 map {
469 $_->overlaps($peptide)
470 ? $_->end - $peptide->end
471 : $_->length
473 grep $_->end > $peptide->end, # find exons that overlap the UTR
474 @$exons
477 return $exons->[0]->strand == -1 ? ($trim_right, $trim_left) : ( $trim_left, $trim_right );
480 sub _peptides_rs {
481 my ( $mrna_feature ) = @_;
483 $mrna_feature
484 ->feature_relationship_objects({
485 'me.type_id' => {
486 -in => _cvterm_rs( $mrna_feature, 'relationship', 'derives_from' )
487 ->get_column('cvterm_id')
488 ->as_query,
491 ->search_related( 'subject', {
492 'subject.type_id' => {
493 -in => _cvterm_rs( $mrna_feature, 'sequence', 'polypeptide' )
494 ->get_column('cvterm_id')
495 ->as_query,
499 sub _peptide_loc {
500 my ($rs) = @_;
501 $rs->search_related( 'featureloc_features', {
502 #srcfeature_id => { -not => undef },
503 srcfeature_id => { -not => undef }, locgroup => 0
505 { # Don't prefetch srcfeatures, it significantly slows down the query
506 # prefetch => 'srcfeature',
507 order_by => 'fmin',
512 sub _exon_rs {
513 my ( $mrna_feature ) = @_;
515 my $rs = $mrna_feature->feature_relationship_objects({
516 'me.type_id' => {
517 -in => _cvterm_rs( $mrna_feature, 'relationship', 'part_of' )
518 ->get_column('cvterm_id')
519 ->as_query,
523 prefetch => 'type',
525 ->search_related( 'subject', {
526 'subject.type_id' => {
527 -in => _cvterm_rs( $mrna_feature, 'sequence', 'exon' )
528 ->get_column('cvterm_id')
529 ->as_query,
533 prefetch => 'featureloc_features',
535 ->search_related( 'featureloc_features', {
536 #srcfeature_id => { -not => undef },
537 srcfeature_id => { -not => undef }, locgroup => 0
540 prefetch => 'srcfeature',
541 order_by => 'fmin',
544 return $rs;
547 sub _cvterm_rs {
548 my ( $row, $cv, $cvt ) = @_;
550 return $row->result_source->schema
551 ->resultset('Cv::Cv')
552 ->search({ 'me.name' => $cv })
553 ->search_related('cvterms', {
554 'cvterms.name' => $cvt,