can download plant phenotype data in the same way as plot phenotype data
[sgn.git] / lib / SGN / View / Feature.pm
blob68bd9702cd40bbe9b40118009ef851e4619429d5
1 package SGN::View::Feature;
2 use strict;
3 use warnings;
5 use base 'Exporter';
7 use HTML::Entities;
8 use List::Util qw/ sum /;
9 use List::MoreUtils qw/ any uniq /;
11 use Bio::Seq;
13 use CXGN::Tools::Text qw/commify_number/;
14 use CXGN::Tools::Identifiers;
17 our @EXPORT_OK = qw/
18 related_stats feature_table
19 feature_link
20 cvterm_link
21 organism_link feature_length
22 mrna_cds_protein_sequence
23 description_featureprop_types
24 get_descriptions
25 location_list_html
26 location_list
27 location_string
28 location_string_html
29 type_name
32 sub type_name {
33 cvterm_name( shift->type, @_ );
36 sub cvterm_name {
37 my ($cvt, $caps) = @_;
38 ( my $n = $cvt->name ) =~ s/_/ /g;
39 if( $caps ) {
40 $n =~ s/(\S+)/lc($1) eq $1 ? ucfirst($1) : $1/e;
42 return $n;
45 sub description_featureprop_types {
46 shift->result_source->schema
47 ->resultset('Cv::Cvterm')
48 ->search({
49 name => [ 'Note',
50 'functional_description',
51 'Description',
52 'description',
57 sub get_descriptions {
58 my ( $feature, $plain ) = @_;
60 my $desc_types =
61 description_featureprop_types( $feature )
62 ->get_column('cvterm_id')
63 ->as_query;
65 my @descriptions =
66 $feature->search_related('featureprops',
67 { type_id => { -in => $desc_types } },
68 { order_by => 'rank' },
70 ->get_column('value')
71 ->all;
74 return @descriptions if defined $plain;
76 s/(\S+)/my $id = $1; CXGN::Tools::Identifiers::link_identifier($id) || $id/ge for @descriptions;
78 return @descriptions;
81 sub location_string_html {
82 my ( $id, $start, $end, $strand ) = @_;
83 if( @_ == 1 ) {
84 my $loc = shift;
85 $id = feature_link($loc->srcfeature);
86 $start = $loc->fmin+1;
87 $end = $loc->fmax;
88 $strand = $loc->strand;
90 ( $start, $end ) = ( $end, $start ) if $strand && $strand == -1;
91 return "$id:$start..$end";
94 sub location_string {
95 my ( $id, $start, $end, $strand ) = @_;
96 if( @_ == 1 ) {
97 my $loc = shift;
98 $id = $loc->srcfeature->name;
99 $start = $loc->fmin+1;
100 $end = $loc->fmax;
101 $strand = $loc->strand;
103 ( $start, $end ) = ( $end, $start ) if $strand && $strand == -1;
104 return "$id:$start..$end";
107 sub location_list_html {
108 my ($feature, $featurelocs) = @_;
109 my @coords = map { location_string_html($_) }
110 ( #$featurelocs ? $featurelocs->all
111 # : $feature->featureloc_features->all );
112 $featurelocs ? $featurelocs->search({locgroup => 0,},)->all
113 : $feature->featureloc_features->search({locgroup => 0,},)->all)
114 or return '<span class="ghosted">none</span>';
115 return @coords;
117 sub location_list {
118 my ($feature, $featurelocs) = @_;
119 print STDERR "\n\nLOCATON LIST\n\n";
120 return map { ($_->srcfeature ? $_->srcfeature->name : '<span class="ghosted">null</span>') . ':' . ($_->fmin+1) . '..' . $_->fmax }
121 ( #$featurelocs ? $featurelocs->all
122 # : $feature->featureloc_features->all );
123 $featurelocs ? $featurelocs->search({locgroup => 0,},)->all
124 : $feature->featureloc_features->search({locgroup => 0,},)->all );
127 sub related_stats {
128 my ($features) = @_;
129 my $stats = { };
130 my $total = scalar @$features;
131 for my $f (@$features) {
132 $stats->{cvterm_link($f->type)}++;
134 my $data = [ ];
135 for my $k (sort keys %$stats) {
136 push @$data, [ $stats->{$k}, $k ];
138 if( 1 < scalar keys %$stats ) {
139 push @$data, [ $total, "<b>Total</b>" ];
141 return $data;
144 sub feature_table {
145 my ( $features, $reference_sequence, $omit_columns ) = @_;
147 { no warnings 'uninitialized';
148 $omit_columns ||= [];
149 $omit_columns = [$omit_columns] unless ref $omit_columns eq 'ARRAY';
152 my @data;
153 for my $f (sort { $a->name cmp $b->name } @$features) {
154 my @ref_condition =
155 $reference_sequence ? ( srcfeature_id => $reference_sequence->feature_id )
156 : ();
158 my @locations = $f->search_related('featureloc_features', {
159 @ref_condition,
160 locgroup => 0,
162 { order_by => 'feature_id' }
165 if( @locations ) {
166 # Add a row for every featureloc
167 my $first_location = 0;
168 for my $loc (@locations) {
169 my $ref = $loc->srcfeature;
170 my ($start,$end) = ($loc->fmin+1, $loc->fmax);
171 push @data, [
172 ( $first_location++
173 ? ('','','')
174 : ( organism_link( $f->organism ),
175 cvterm_link($f->type),
176 feature_link($f),
179 ($ref ? $ref->name : '<span class="ghosted">null</span>').":$start..$end",
180 commify_number( feature_length( $f, $loc ) ) || undef,
181 $loc->strand ? ( $loc->strand == 1 ? '+' : '-' ) : undef,
182 $loc->phase || undef,
186 else {
187 my $nl = 'not located';
188 if( $reference_sequence ) {
189 $nl .= " on ".encode_entities( $reference_sequence->name )
191 push @data, [
192 organism_link( $f->organism ),
193 cvterm_link($f->type),
194 feature_link($f),
195 qq|<span class="ghosted">$nl</span>|,
196 commify_number( feature_length( $f, undef ) ) || undef,
197 undef,
198 undef,
203 my @headings = ( "Organism", "Type", "Name", "Location", "Length", "Strand", "Phase" );
205 my @align = map 'l', @headings;
207 # omit any columns that are *all* undefined, or that we were
208 # requested to omit
209 my @cols_to_omit = uniq(
210 do {
211 my %heading_index = do { my $i = 0; map { lc $_ => $i++ } @headings };
212 (map {
213 my $i = $heading_index{lc $_};
214 defined $i or die "$_ column not found";
216 } @$omit_columns
220 for my $t ( [\@headings], \@data, [\@align] ) {
221 for my $row ( @$t ) {
222 splice( @$row, $_, 1 ) for @cols_to_omit;
226 # make html for any other undef cells
227 for (@data) {
228 for (@$_) {
229 $_ = '<span class="ghosted">n/a</span>' unless defined;
233 return ( headings => \@headings, data => \@data, __align => \@align, __alt_freq => 0 , __border => 1 );
236 # try to figure out the "length" of a feature, which will vary for different features
237 sub feature_length {
238 my ( $feature, $location ) = @_;
240 $location = $location->first
241 if $location && $location->isa('DBIx::Class::ResultSet');
243 my $type = $feature->type;
244 my $type_name = $type->name;
246 # firstly, for any feature, trust the length of its residues if it has them
247 if( my $seqlen = $feature->seqlen || $feature->residues && length $feature->residues ) {
248 return $seqlen;
250 # for some features, can say that its length is the length of its location
251 elsif( any { $type_name eq $_ } qw( exon gene ) ) {
252 return unless $location;
253 return $location->fmax - $location->fmin;
255 return;
258 sub _feature_search_string {
259 my ($fl) = @_;
260 return '' unless $fl;
261 return ($fl->srcfeature ? $fl->srcfeature->name : '<span class="ghosted">null</span>') . ':'. ($fl->fmin+1) . '..' . $fl->fmax;
265 ### XXX TODO: A lot of these _link and sequence functions need to be
266 ### moved to controller code.
268 sub feature_link {
269 my ($feature) = @_;
270 return '<span class="ghosted">null</span>' unless $feature;
271 my $id = $feature->feature_id;
272 my $name = $feature->name;
273 return qq{<a href="/feature/$id/details">$name</a>};
276 sub organism_link {
277 my ($organism) = @_;
278 my $id = $organism->organism_id;
279 my $species = $organism->species;
280 return qq{<a class="species_binomial" href="/chado/organism.pl?organism_id=$id">$species</a>};
283 sub cvterm_link {
284 my ( $cvt, $caps ) = @_;
285 my $name = cvterm_name( $cvt, $caps );
286 my $id = $cvt->id;
287 return qq{<a href="/cvterm/$id/view">$name</a>};
290 sub mrna_cds_protein_sequence {
291 my ($mrna_feature) = @_;
293 # if we were actually passed a polypeptide, get its mrna(s) and
294 # recurse
295 if( $mrna_feature->type->name eq 'polypeptide' ) {
296 return
297 map mrna_cds_protein_sequence( $_ ),
298 $mrna_feature->search_related('feature_relationship_subjects',
299 { 'me.type_id' => {
300 -in => $mrna_feature->result_source->schema
301 ->resultset('Cv::Cvterm')
302 ->search({name => 'derives_from'})
303 ->get_column('cvterm_id')
304 ->as_query,
308 ->search_related('object');
311 my $description = join ', ', get_descriptions( $mrna_feature, 'no html' );
312 my $peptide = _peptides_rs( $mrna_feature )->first;
314 my @exon_locations = _exon_rs( $mrna_feature )->all;
315 unless( @exon_locations ) {
316 # cannot calculate the cds and protein without exons, because
317 # UTRs can sometimes have introns in them. without knowing
318 # the exon structure, we don't know how much to cut off of the
319 # UTRs
320 return [
321 $mrna_feature->subseq(1,1) ? $mrna_feature : undef,
322 undef,
323 $peptide && $peptide->subseq(1,1) ? $peptide : undef,
327 my $mrna_seq = $mrna_feature->subseq(1,1) ? $mrna_feature : _make_mrna_seq( $mrna_feature, $description, \@exon_locations );
328 my $peptide_loc = $peptide && _peptide_loc($peptide)->first;
330 # just return the mrna seq and nothing else if we have no peptide
331 # or the peptide is not located
332 unless( $peptide && $peptide_loc ) {
333 return [ $mrna_seq, undef, undef ] unless $peptide && $peptide_loc;
336 my $cds_seq = Bio::PrimarySeq->new(
337 -id => $mrna_seq->display_name,
338 -desc => $description,
339 -seq => $mrna_seq->seq,
341 my ( $trim_from_left, $trim_from_right ) = _calculate_cdna_utr_lengths(
342 _loc2range( $peptide_loc ),
343 [ map _loc2range( $_), @exon_locations ],
346 if( $trim_from_left || $trim_from_right ) {
347 $cds_seq = $cds_seq->trunc( 1+$trim_from_left, $mrna_seq->length - $trim_from_right );
349 ##my $protein_seq = $cds_seq->translate;
350 ##Get the protein sequence from the peptide object (stored in the database in the residues field of the feature table)
351 ## No need to try to translade the CDS
352 my $protein_seq = Bio::PrimarySeq->new(
353 -id => $mrna_seq->display_name,
354 -desc => $description,
355 -seq => $peptide->residues,
358 return [ $mrna_seq, $cds_seq, $protein_seq ];
361 sub _make_mrna_seq {
362 my ( $mrna_feat, $description, $exons ) = @_;
364 # NOTE: doing this subseq math in 0-based coords
365 my $span_start = $exons->[0]->fmin;
366 my $span_end = $exons->[-1]->fmax-1;
368 # 0 1 2 3 4 5 6 7 8 interbase (Chado)
369 # G|C|C|A|T|G|T|A
370 # 0 1 2 3 4 5 6 7 0-based (substr)
371 # 1 2 3 4 5 6 7 8 1-based (BioPerl)
373 # recall: the exons are in sorted order
374 my $span_seq = $exons->[0]->srcfeature->subseq( $span_start+1, $span_end+1 ); #< 1-based
375 my $mrna_sequence = join '', map { substr($span_seq, $_->fmin - $span_start, $_->fmax - $_->fmin ) } @$exons;
377 my $mrna_seq = Bio::PrimarySeq->new(
378 -id => $mrna_feat->name,
379 -desc => $description,
380 -seq => $mrna_sequence,
383 $mrna_seq = $mrna_seq->revcom if $exons->[0]->strand == -1;
385 return $mrna_seq;
388 sub _loc2range {
389 my ( $loc ) = @_;
390 return $loc->to_range if $loc->can('to_range');
391 return Bio::Range->new(
392 -start => $loc->fmin + 1,
393 -end => $loc->fmax,
394 -strand => $loc->strand,
398 # given the range of the peptide and the ranges of each of the exons
399 # (as Bio::RangeI's), calculate how many bases should be trimmed off
400 # of each end of the cDNA (i.e. mRNA) seq to get the CDS seq
401 sub _calculate_cdna_utr_lengths {
402 my ( $peptide, $exons ) = @_;
404 my ( $trim_left, $trim_right ) = ( 0, 0 );
406 # calculate trim_fmin if necessary
407 if( $exons->[0]->start < $peptide->start ) {
409 $trim_left =
411 map {
412 $_->overlaps($peptide)
413 ? $peptide->start - $_->start
414 : $_->length
416 grep $_->start < $peptide->start, # find exons that overlap the UTR
417 @$exons
420 # calculate trim_fmax if necessary
421 if( $exons->[-1]->end > $peptide->end ) {
422 $trim_right =
424 map {
425 $_->overlaps($peptide)
426 ? $_->end - $peptide->end
427 : $_->length
429 grep $_->end > $peptide->end, # find exons that overlap the UTR
430 @$exons
433 return $exons->[0]->strand == -1 ? ($trim_right, $trim_left) : ( $trim_left, $trim_right );
436 sub _peptides_rs {
437 my ( $mrna_feature ) = @_;
439 $mrna_feature
440 ->feature_relationship_objects({
441 'me.type_id' => {
442 -in => _cvterm_rs( $mrna_feature, 'relationship', 'derives_from' )
443 ->get_column('cvterm_id')
444 ->as_query,
447 ->search_related( 'subject', {
448 'subject.type_id' => {
449 -in => _cvterm_rs( $mrna_feature, 'sequence', 'polypeptide' )
450 ->get_column('cvterm_id')
451 ->as_query,
455 sub _peptide_loc {
456 my ($rs) = @_;
457 $rs->search_related( 'featureloc_features', {
458 #srcfeature_id => { -not => undef },
459 srcfeature_id => { -not => undef }, locgroup => 0
461 { # Don't prefetch srcfeatures, it significantly slows down the query
462 # prefetch => 'srcfeature',
463 order_by => 'fmin',
468 sub _exon_rs {
469 my ( $mrna_feature ) = @_;
471 my $rs = $mrna_feature->feature_relationship_objects({
472 'me.type_id' => {
473 -in => _cvterm_rs( $mrna_feature, 'relationship', 'part_of' )
474 ->get_column('cvterm_id')
475 ->as_query,
479 prefetch => 'type',
481 ->search_related( 'subject', {
482 'subject.type_id' => {
483 -in => _cvterm_rs( $mrna_feature, 'sequence', 'exon' )
484 ->get_column('cvterm_id')
485 ->as_query,
489 prefetch => 'featureloc_features',
491 ->search_related( 'featureloc_features', {
492 #srcfeature_id => { -not => undef },
493 srcfeature_id => { -not => undef }, locgroup => 0
496 prefetch => 'srcfeature',
497 order_by => 'fmin',
500 return $rs;
503 sub _cvterm_rs {
504 my ( $row, $cv, $cvt ) = @_;
506 return $row->result_source->schema
507 ->resultset('Cv::Cv')
508 ->search({ 'me.name' => $cv })
509 ->search_related('cvterms', {
510 'cvterms.name' => $cvt,