bump this version to avoid any collisions
[bioperl-live.git] / Bio / Assembly / Scaffold.pm
blob9bbfadc897f95a6890f81564945d17f3ea378533
2 # BioPerl module for Bio::Assembly::Scaffold
4 # Copyright by Robson F. de Souza
6 # You may distribute this module under the same terms as perl itself
8 # POD documentation - main docs before the code
10 =head1 NAME
12 Bio::Assembly::Scaffold - Perl module to hold and manipulate sequence assembly
13 data.
15 =head1 SYNOPSIS
17 # Module loading
18 use Bio::Assembly::IO;
20 # Assembly loading methods
21 my $aio = Bio::Assembly::IO->new(-file=>"test.ace.1", -format=>'phrap');
22 my $assembly = $aio->next_assembly;
24 foreach my $contig ($assembly->all_contigs) {
25 # do something... (see Bio::Assembly::Contig)
28 =head1 DESCRIPTION
30 Bio::Assembly::Scaffold was developed to store and manipulate data
31 from sequence assembly programs like Phrap. It implements the
32 ScaffoldI interface and intends to be generic enough to be used by
33 Bio::Assembly::IO drivers written to programs other than Phrap.
35 =head1 FEEDBACK
37 =head2 Mailing Lists
39 User feedback is an integral part of the evolution of this and other
40 Bioperl modules. Send your comments and suggestions preferably to the
41 Bioperl mailing lists Your participation is much appreciated.
43 bioperl-l@bioperl.org - General discussion
44 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
46 =head2 Support
48 Please direct usage questions or support issues to the mailing list:
50 I<bioperl-l@bioperl.org>
52 rather than to the module maintainer directly. Many experienced and
53 reponsive experts will be able look at the problem and quickly
54 address it. Please include a thorough description of the problem
55 with code and data examples if at all possible.
57 =head2 Reporting Bugs
59 Report bugs to the Bioperl bug tracking system to help us keep track
60 the bugs and their resolution. Bug reports can be submitted via the
61 web:
63 https://github.com/bioperl/bioperl-live/issues
65 =head1 AUTHOR - Robson Francisco de Souza
67 rfsouza@citri.iq.usp.br
69 =head1 APPENDIX
71 The rest of the documentation details each of the object
72 methods. Internal methods are usually preceded with a _
74 =cut
76 package Bio::Assembly::Scaffold;
78 use strict;
79 use Bio::Annotation::Collection;
81 use base qw(Bio::Root::Root Bio::Assembly::ScaffoldI);
83 =head2 new ()
85 Title : new
86 Usage : $scaffold = new ( -id => "assembly 1",
87 -source => 'program_name',
88 -contigs => \@contigs,
89 -singlets => \@singlets );
90 Function: creates a new scaffold object
91 Returns : Bio::Assembly::Scaffold
92 Args : -id : [string] scaffold name
93 -source : [string] sequence assembly program
94 -contigs : reference to array of Bio::Assembly::Contig objects
95 -singlets : reference to array of Bio::Assembly::Singlet objects
98 =cut
100 sub new {
101 my($class, @args) = @_;
102 my $self = $class->SUPER::new(@args);
103 my ($id, $src, $contigs, $singlets) = $self->_rearrange(
104 [qw(ID SOURCE CONTIGS SINGLETS)], @args);
106 # Scaffold defaults
107 $self->{'_id'} = 'NoName';
108 $self->{'_source'} = 'Unknown';
109 $self->{'_contigs'} = {};
110 $self->{'_singlets'} = {};
111 $self->{'_seqs'} = {};
112 $self->{'_annotation'} = Bio::Annotation::Collection->new();
114 # Import manual info
115 $self->{'_id'} = $id if (defined $id);
116 $self->{'_source'} = $src if (defined $src);
118 # Add contigs and singlets to scaffold
119 if (defined $contigs && ref($contigs = 'ARRAY')) {
120 for my $contig (@{$contigs}) {
121 if( ! ref $contig || ! $contig->isa('Bio::Assembly::Contig') ) {
122 $self->throw("Bio::Assembly::Scaffold::new is unable to process non".
123 "Bio::Assembly::Contig object [", ref($contig), "]");
125 $self->add_contig($contig);
128 if (defined $singlets && ref($singlets = 'ARRAY')) {
129 for my $singlet (@{$singlets}) {
130 if( ! ref $singlet || ! $singlet->isa('Bio::Assembly::Singlet') ) {
131 $self->throw("Bio::Assembly::Scaffold::new is unable to process non".
132 "Bio::Assembly::Singlet object [", ref($singlet), "]");
134 $self->add_singlet($singlet);
138 return $self;
141 =head1 Accessing general assembly data
143 =cut
145 =head2 id
147 Title : id
148 Usage : $assembly->id()
149 Function: Get/Set assembly ID
150 Returns : string or undef
151 Args : string
153 =cut
155 sub id {
156 my ($self, $id) = @_;
157 return $self->{'_id'} = $id if (defined $id);
158 return $self->{'_id'};
161 =head2 annotation
163 Title : annotation
164 Usage : $assembly->annotation()
165 Function: Get/Set assembly annotation object
166 Returns : Bio::Annotation::Collection
167 Args : none
169 =cut
171 sub annotation {
172 my ($self, $ref) = shift;
173 $self->{'_annotation'} = $ref if (defined $ref);
174 return $self->{'_annotation'};
177 =head2 get_nof_contigs
179 Title : get_nof_contigs
180 Usage : $assembly->get_nof_contigs()
181 Function: Get the number of contigs included in the scaffold
182 Returns : integer
183 Args : none
185 =cut
187 sub get_nof_contigs {
188 my $self = shift;
189 return scalar( $self->get_contig_ids() );
192 =head2 get_nof_contig_seqs
194 Title : get_nof_contig_seqs
195 Usage : $assembly->get_nof_contig_seqs()
196 Function: Get the number of sequences included in contigs of the
197 scaffold (no consensus sequences or singlets)
198 Returns : integer
199 Args : none
201 =cut
203 sub get_nof_contig_seqs {
204 my $self = shift;
206 my $nof_seqs = 0;
207 foreach my $contig ($self->all_contigs) {
208 $nof_seqs += scalar( $contig->get_seq_ids() );
211 return $nof_seqs;
213 # function alias for backward compatibility
214 *get_nof_sequences_in_contigs = \&get_nof_contig_seqs;
217 =head2 get_nof_singlets (get_nof_singlet_seqs)
219 Title : nof_singlets
220 Usage : $assembly->nof_singlets()
221 Function: Get the number of singlets included in the assembly
222 Returns : integer
223 Args : none
225 =cut
227 sub get_nof_singlets {
228 my $self = shift;
229 return scalar( $self->get_singlet_ids() );
231 *get_nof_singlet_seqs = \&get_nof_singlets;
233 =head2 get_all_seq_ids
235 Title : get_all_seq_ids
236 Usage : $assembly->get_all_seq_ids()
237 Function: Get the ID of all sequences making up the scaffold
238 (sequences from contigs and singlets, not consensus).
239 Returns : array of strings
240 Args : none
242 =cut
244 sub get_all_seq_ids {
245 my $self = shift;
246 return keys %{ $self->{'_seqs'} };
249 =head2 get_nof_seqs
251 Title : get_nof_seqs
252 Usage : $assembly->get_nof_seqs()
253 Function: Get total number of sequences making up the scaffold
254 (sequences from contigs and singlets, not consensus).
255 Returns : integer
256 Args : none
258 =cut
260 sub get_nof_seqs {
261 my $self = shift;
262 return scalar $self->get_all_seq_ids;
265 =head2 get_contig_seq_ids
267 Title : get_contig_seq_ids
268 Usage : $assembly->get_contig_seq_ids()
269 Function: Get the ID of all sequences in contigs
270 Returns : array of strings
271 Args : none
273 =cut
275 sub get_contig_seq_ids {
276 my $self = shift;
277 my @ids;
278 for my $contig ( $self->all_contigs ) {
279 push @ids, $contig->get_seq_ids;
281 return @ids;
283 # function alias for backward compatibility
284 *get_seq_ids = \&get_contig_seq_ids;
286 =head2 get_contig_ids
288 Title : get_contig_ids
289 Usage : $assembly->get_contig_ids()
290 Function: Access list of contig IDs from assembly
291 Returns : an array, if there are any contigs in the
292 assembly. An empty array otherwise
293 Args : none
295 =cut
297 sub get_contig_ids {
298 my $self = shift;
300 return wantarray
301 ? sort keys %{$self->{'_contigs'}}
302 : scalar keys %{$self->{'_contigs'}};
305 =head2 get_singlet_ids (get_singlet_seq_ids)
307 Title : get_singlet_ids
308 Usage : $assembly->get_singlet_ids()
309 Function: Access list of singlet IDs from assembly
310 Returns : array of strings if there are any singlets
311 otherwise an empty array
312 Args : none
314 =cut
316 sub get_singlet_ids {
317 my $self = shift;
319 return wantarray
320 ? sort keys %{$self->{'_singlets'}}
321 : scalar keys %{$self->{'_singlets'}};
323 *get_singlet_seq_ids = \&get_singlet_ids;
325 =head2 get_seq_by_id
327 Title : get_seq_by_id
328 Usage : $assembly->get_seq_by_id($id)
329 Function: Get a reference for an sequence making up the scaffold
330 (from a contig or singlet, not consensus)
331 Returns : a Bio::LocatableSeq object
332 undef if sequence $id is not found in the scaffold
333 Args : [string] sequence identifier (id)
335 =cut
337 sub get_seq_by_id {
338 my $self = shift;
339 my $seqID = shift;
341 return unless (exists $self->{'_seqs'}{$seqID});
343 return $self->{'_seqs'}{$seqID}->get_seq_by_name($seqID);
346 =head2 get_contig_by_id
348 Title : get_contig_by_id
349 Usage : $assembly->get_contig_by_id($id)
350 Function: Get a reference for a contig
351 Returns : a Bio::Assembly::Contig object or undef
352 Args : [string] contig unique identifier (ID)
354 =cut
356 sub get_contig_by_id {
357 my $self = shift;
358 my $contigID = shift;
360 return unless (exists $self->{'_contigs'}{$contigID});
362 return $self->{'_contigs'}{$contigID};
365 =head2 get_singlet_by_id
367 Title : get_singlet_by_id
368 Usage : $assembly->get_singlet_by_id()
369 Function: Get a reference for a singlet
370 Returns : Bio::Assembly::Singlet object or undef
371 Args : [string] a singlet ID
373 =cut
375 sub get_singlet_by_id {
376 my $self = shift;
378 my $singletID = shift;
380 return unless (exists $self->{'_singlets'}{$singletID});
382 return $self->{'_singlets'}{$singletID};
385 =head1 Modifier methods
387 =cut
389 =head2 add_contig
391 Title : add_contig
392 Usage : $assembly->add_contig($contig)
393 Function: Add a contig to the assembly
394 Returns : 1 on success
395 Args : a Bio::Assembly::Contig object
396 order (optional)
398 =cut
400 sub add_contig {
401 my ($self, $contig) = @_;
403 # Input check
404 if( !ref $contig || ! $contig->isa('Bio::Assembly::Contig') ) {
405 $self->throw("Bio::Assembly::Scaffold::add_contig is unable to process".
406 " non Bio::Assembly::Contig object [", ref($contig), "]");
409 # Create and attribute contig ID
410 my $contigID = $contig->id();
411 if( !defined $contigID ) {
412 $contigID = 'Unknown_' . ($self->get_nof_contigs() + 1);
413 $contig->id($contigID);
414 $self->warn("Attributing ID $contigID to unnamed Bio::Assembly::Contig".
415 " object.");
418 # Adding contig to scaffold
419 $self->warn("Replacing contig $contigID with a new contig object")
420 if (exists $self->{'_contigs'}{$contigID});
421 $self->{'_contigs'}{$contigID} = $contig;
422 $contig->assembly($self); # weak circular reference
424 # Put contig sequences in the list of sequences belonging to the scaffold
425 foreach my $seqID ($contig->get_seq_ids()) {
426 if (exists $self->{'_seqs'}{$seqID} &&
427 not($self->{'_seqs'}{$seqID} eq $contig) ) {
428 $self->warn( "Sequence $seqID already assigned to object ".
429 $self->{'_seqs'}{$seqID}->id().". Moving to contig $contigID");
431 $self->{'_seqs'}{$seqID} = $contig;
434 return 1;
437 =head2 add_singlet
439 Title : add_singlet
440 Usage : $assembly->add_singlet($seq)
441 Function: Add a singlet to the assembly
442 Returns : 1 on success
443 Args : a Bio::Assembly::Singlet object
444 order (optional)
446 =cut
448 sub add_singlet {
449 my ($self, $singlet) = @_;
451 # Input check
452 if ( !ref $singlet || ! $singlet->isa('Bio::Assembly::Singlet') ) {
453 $self->throw("Bio::Assembly::Scaffold::add_singlet is unable to process".
454 " non Bio::Assembly::Singlet object [", ref($singlet), "]");
457 # Create and attribute singlet ID
458 my $singletID = $singlet->id();
459 if( !defined $singletID ) {
460 $singletID = 'Unknown_' . ($self->get_nof_singlets() + 1);
461 $singlet->id($singletID);
462 $self->warn("Attributing ID $singletID to unnamed Bio::Assembly::".
463 "Singlet object.");
466 # Adding singlet to scaffold
467 $self->warn("Replacing singlet $singletID with a new singlet object")
468 if (exists $self->{'_singlets'}{$singletID});
469 $self->{'_singlets'}{$singletID} = $singlet;
470 $singlet->assembly($self); # weak circular reference
472 # Put singlet sequence in the list of sequences belonging to the scaffold
473 my $seqID = $singlet->seqref->id();
474 if (exists $self->{'_seqs'}{$seqID} &&
475 not($self->{'_seqs'}{$seqID} eq $singlet) ) {
476 $self->warn( "Sequence $seqID already assigned to object ".
477 $self->{'_seqs'}{$seqID}->id().". Moving to singlet $singletID");
479 $self->{'_seqs'}{$seqID} = $singlet;
481 return 1;
484 =head2 update_seq_list
486 Title : update_seq_list
487 Usage : $assembly->update_seq_list()
488 Function:
490 Synchronizes the assembly registry for sequences in
491 contigs and contig actual aligned sequences content. You
492 probably want to run this after you remove/add a
493 sequence from/to a contig in the assembly.
495 Returns : 1 for success
496 Args : none
498 =cut
500 sub update_seq_list {
501 my $self = shift;
503 $self->{'_seqs'} = {};
505 # Put sequences in contigs in list of sequences belonging to the scaffold
506 foreach my $contig ($self->all_contigs) {
507 my $contigID = $contig->id();
508 foreach my $seqID ($contig->get_seq_ids) {
509 if (exists $self->{'_seqs'}{$seqID} &&
510 not($self->{'_seqs'}{$seqID} eq $contig) ) {
511 $self->warn( "Sequence $seqID already assigned to object ".
512 $self->{'_seqs'}{$seqID}->id().". Moving to contig $contigID");
514 $self->{'_seqs'}{$seqID} = $contig;
518 # Put singlet sequences in the list of sequences belonging to the scaffold
519 foreach my $singlet ($self->all_singlets) {
520 my $singletID = $singlet->id();
521 my $seqID = $singlet->seqref->id();
522 if (exists $self->{'_seqs'}{$seqID} &&
523 not($self->{'_seqs'}{$seqID} eq $singlet) ) {
524 $self->warn( "Sequence $seqID already assigned to object ".
525 $self->{'_seqs'}{$seqID}->id().". Moving to singlet $singletID");
527 $self->{'_seqs'}{$seqID} = $singlet;
530 return 1;
533 =head2 remove_contigs
535 Title : remove_contigs
536 Usage : $assembly->remove_contigs(1..4)
537 Function: Remove contig from assembly object
538 Returns : an array of removed Bio::Assembly::Contig
539 objects
540 Args : an array of contig IDs
542 See function get_contig_ids() above
544 =cut
546 sub remove_contigs {
547 my ($self, @args) = @_;
549 my @ret = ();
550 foreach my $contigID (@args) {
551 foreach my $seqID ($self->get_contig_by_id($contigID)->get_seq_ids()) {
552 delete $self->{'_seqs'}{$seqID};
554 push(@ret, $self->{'_contigs'}{$contigID});
555 delete $self->{'_contigs'}{$contigID};
558 return @ret;
561 =head2 remove_singlets
563 Title : remove_singlets
564 Usage : $assembly->remove_singlets(@singlet_ids)
565 Function: Remove singlet from assembly object
566 Returns : the Bio::Assembly::Singlet objects removed
567 Args : a list of singlet IDs
569 See function get_singlet_ids() above
571 =cut
573 sub remove_singlets {
574 my ($self,@args) = @_;
576 my @ret = ();
577 foreach my $singletID (@args) {
578 push(@ret,$self->{'_singlets'}{$singletID});
579 delete $self->{'_singlets'}{$singletID};
582 return @ret;
585 =head2 remove_features_collection
587 Title : remove_features_collection
588 Usage : $assembly->remove_features_collection()
589 Function: Removes the collection of features associated to every
590 contig and singlet of the scaffold. This can be useful
591 to save some memory (when contig and singlet features are
592 not needed).
593 Returns : none
594 Argument : none
596 =cut
598 sub remove_features_collection {
599 my ($self) = @_;
600 for my $obj ( $self->all_contigs, $self->all_singlets ) {
601 $obj->remove_features_collection;
603 return;
607 =head1 Contig and singlet selection methods
609 =cut
611 =head2 select_contigs
613 Title : select_contigs
614 Usage : $assembly->select_contigs(@list)
615 Function: Select an array of contigs from the assembly
616 Returns : an array of Bio::Assembly::Contig objects
617 Args : an array of contig ids
619 See function get_contig_ids() above
621 =cut
623 sub select_contigs {
624 my ($self,@args) = @_;
626 my @contigs = ();
627 foreach my $contig (@args) {
628 unless (exists $self->{'_contigs'}{$contig}) {
629 $self->warn("$contig contig not found. Ignoring...");
630 next;
632 push(@contigs, $self->{'_contigs'}{$contig});
635 return @contigs;
638 =head2 select_singlets
640 Title : select_singlets
641 Usage : $assembly->select_singlets(@list)
642 Function: Selects an array of singlets from the assembly
643 Returns : an array of Bio::Assembly::Singlet objects
644 Args : an array of singlet ids
646 See function get_singlet_ids() above
648 =cut
650 sub select_singlets {
651 my ($self,@args) = @_;
653 my @singlets = ();
654 foreach my $singlet (@args) {
655 unless (exists $self->{'_singlets'}{$singlet}) {
656 $self->warn("$singlet singlet not found. Ignoring...");
657 next;
659 push(@singlets, $self->{'_singlets'}{$singlet});
662 return @singlets;
665 =head2 all_contigs
667 Title : all_contigs
668 Usage : my @contigs = $assembly->all_contigs
669 Function:
671 Returns a list of all contigs in this assembly. Contigs
672 are both clusters and alignments of one or more reads,
673 with an associated consensus sequence.
675 Returns : array of Bio::Assembly::Contig (in lexical id order)
676 Args : none
678 =cut
680 sub all_contigs {
681 my ($self) = @_;
683 my @contigs = ();
684 foreach my $contig (sort { $a cmp $b } keys %{ $self->{'_contigs'} }) {
685 push(@contigs, $self->{'_contigs'}{$contig});
688 return @contigs;
691 =head2 all_singlets
693 Title : all_singlets
694 Usage : my @singlets = $assembly->all_singlets
695 Function:
697 Returns a list of all singlets in this assembly.
698 Singlets are isolated reads, without non-vector
699 matches to any other read in the assembly.
701 Returns : array of Bio::Assembly::Singlet objects (in lexical order by id)
702 Args : none
704 =cut
706 sub all_singlets {
707 my ($self) = @_;
709 my @singlets = ();
710 foreach my $singlet (sort { $a cmp $b } keys %{ $self->{'_singlets'} }) {
711 push(@singlets, $self->{'_singlets'}{$singlet});
714 return @singlets;