2 # BioPerl module for Bio::Assembly::Scaffold
4 # Copyright by Robson F. de Souza
6 # You may distribute this module under the same terms as perl itself
8 # POD documentation - main docs before the code
12 Bio::Assembly::Scaffold - Perl module to hold and manipulate sequence assembly
18 use Bio::Assembly::IO;
20 # Assembly loading methods
21 my $aio = Bio::Assembly::IO->new(-file=>"test.ace.1", -format=>'phrap');
22 my $assembly = $aio->next_assembly;
24 foreach my $contig ($assembly->all_contigs) {
25 # do something... (see Bio::Assembly::Contig)
30 Bio::Assembly::Scaffold was developed to store and manipulate data
31 from sequence assembly programs like Phrap. It implements the
32 ScaffoldI interface and intends to be generic enough to be used by
33 Bio::Assembly::IO drivers written to programs other than Phrap.
39 User feedback is an integral part of the evolution of this and other
40 Bioperl modules. Send your comments and suggestions preferably to the
41 Bioperl mailing lists Your participation is much appreciated.
43 bioperl-l@bioperl.org - General discussion
44 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
48 Please direct usage questions or support issues to the mailing list:
50 I<bioperl-l@bioperl.org>
52 rather than to the module maintainer directly. Many experienced and
53 reponsive experts will be able look at the problem and quickly
54 address it. Please include a thorough description of the problem
55 with code and data examples if at all possible.
59 Report bugs to the Bioperl bug tracking system to help us keep track
60 the bugs and their resolution. Bug reports can be submitted via the
63 https://github.com/bioperl/bioperl-live/issues
65 =head1 AUTHOR - Robson Francisco de Souza
67 rfsouza@citri.iq.usp.br
71 The rest of the documentation details each of the object
72 methods. Internal methods are usually preceded with a _
76 package Bio
::Assembly
::Scaffold
;
79 use Bio
::Annotation
::Collection
;
81 use base
qw(Bio::Root::Root Bio::Assembly::ScaffoldI);
86 Usage : $scaffold = new ( -id => "assembly 1",
87 -source => 'program_name',
88 -contigs => \@contigs,
89 -singlets => \@singlets );
90 Function: creates a new scaffold object
91 Returns : Bio::Assembly::Scaffold
92 Args : -id : [string] scaffold name
93 -source : [string] sequence assembly program
94 -contigs : reference to array of Bio::Assembly::Contig objects
95 -singlets : reference to array of Bio::Assembly::Singlet objects
101 my($class, @args) = @_;
102 my $self = $class->SUPER::new
(@args);
103 my ($id, $src, $contigs, $singlets) = $self->_rearrange(
104 [qw(ID SOURCE CONTIGS SINGLETS)], @args);
107 $self->{'_id'} = 'NoName';
108 $self->{'_source'} = 'Unknown';
109 $self->{'_contigs'} = {};
110 $self->{'_singlets'} = {};
111 $self->{'_seqs'} = {};
112 $self->{'_annotation'} = Bio
::Annotation
::Collection
->new();
115 $self->{'_id'} = $id if (defined $id);
116 $self->{'_source'} = $src if (defined $src);
118 # Add contigs and singlets to scaffold
119 if (defined $contigs && ref($contigs = 'ARRAY')) {
120 for my $contig (@
{$contigs}) {
121 if( ! ref $contig || ! $contig->isa('Bio::Assembly::Contig') ) {
122 $self->throw("Bio::Assembly::Scaffold::new is unable to process non".
123 "Bio::Assembly::Contig object [", ref($contig), "]");
125 $self->add_contig($contig);
128 if (defined $singlets && ref($singlets = 'ARRAY')) {
129 for my $singlet (@
{$singlets}) {
130 if( ! ref $singlet || ! $singlet->isa('Bio::Assembly::Singlet') ) {
131 $self->throw("Bio::Assembly::Scaffold::new is unable to process non".
132 "Bio::Assembly::Singlet object [", ref($singlet), "]");
134 $self->add_singlet($singlet);
141 =head1 Accessing general assembly data
148 Usage : $assembly->id()
149 Function: Get/Set assembly ID
150 Returns : string or undef
156 my ($self, $id) = @_;
157 return $self->{'_id'} = $id if (defined $id);
158 return $self->{'_id'};
164 Usage : $assembly->annotation()
165 Function: Get/Set assembly annotation object
166 Returns : Bio::Annotation::Collection
172 my ($self, $ref) = shift;
173 $self->{'_annotation'} = $ref if (defined $ref);
174 return $self->{'_annotation'};
177 =head2 get_nof_contigs
179 Title : get_nof_contigs
180 Usage : $assembly->get_nof_contigs()
181 Function: Get the number of contigs included in the scaffold
187 sub get_nof_contigs
{
189 return scalar( $self->get_contig_ids() );
192 =head2 get_nof_contig_seqs
194 Title : get_nof_contig_seqs
195 Usage : $assembly->get_nof_contig_seqs()
196 Function: Get the number of sequences included in contigs of the
197 scaffold (no consensus sequences or singlets)
203 sub get_nof_contig_seqs
{
207 foreach my $contig ($self->all_contigs) {
208 $nof_seqs += scalar( $contig->get_seq_ids() );
213 # function alias for backward compatibility
214 *get_nof_sequences_in_contigs
= \
&get_nof_contig_seqs
;
217 =head2 get_nof_singlets (get_nof_singlet_seqs)
220 Usage : $assembly->nof_singlets()
221 Function: Get the number of singlets included in the assembly
227 sub get_nof_singlets
{
229 return scalar( $self->get_singlet_ids() );
231 *get_nof_singlet_seqs
= \
&get_nof_singlets
;
233 =head2 get_all_seq_ids
235 Title : get_all_seq_ids
236 Usage : $assembly->get_all_seq_ids()
237 Function: Get the ID of all sequences making up the scaffold
238 (sequences from contigs and singlets, not consensus).
239 Returns : array of strings
244 sub get_all_seq_ids
{
246 return keys %{ $self->{'_seqs'} };
252 Usage : $assembly->get_nof_seqs()
253 Function: Get total number of sequences making up the scaffold
254 (sequences from contigs and singlets, not consensus).
262 return scalar $self->get_all_seq_ids;
265 =head2 get_contig_seq_ids
267 Title : get_contig_seq_ids
268 Usage : $assembly->get_contig_seq_ids()
269 Function: Get the ID of all sequences in contigs
270 Returns : array of strings
275 sub get_contig_seq_ids
{
278 for my $contig ( $self->all_contigs ) {
279 push @ids, $contig->get_seq_ids;
283 # function alias for backward compatibility
284 *get_seq_ids
= \
&get_contig_seq_ids
;
286 =head2 get_contig_ids
288 Title : get_contig_ids
289 Usage : $assembly->get_contig_ids()
290 Function: Access list of contig IDs from assembly
291 Returns : an array, if there are any contigs in the
292 assembly. An empty array otherwise
301 ?
sort keys %{$self->{'_contigs'}}
302 : scalar keys %{$self->{'_contigs'}};
305 =head2 get_singlet_ids (get_singlet_seq_ids)
307 Title : get_singlet_ids
308 Usage : $assembly->get_singlet_ids()
309 Function: Access list of singlet IDs from assembly
310 Returns : array of strings if there are any singlets
311 otherwise an empty array
316 sub get_singlet_ids
{
320 ?
sort keys %{$self->{'_singlets'}}
321 : scalar keys %{$self->{'_singlets'}};
323 *get_singlet_seq_ids
= \
&get_singlet_ids
;
327 Title : get_seq_by_id
328 Usage : $assembly->get_seq_by_id($id)
329 Function: Get a reference for an sequence making up the scaffold
330 (from a contig or singlet, not consensus)
331 Returns : a Bio::LocatableSeq object
332 undef if sequence $id is not found in the scaffold
333 Args : [string] sequence identifier (id)
341 return unless (exists $self->{'_seqs'}{$seqID});
343 return $self->{'_seqs'}{$seqID}->get_seq_by_name($seqID);
346 =head2 get_contig_by_id
348 Title : get_contig_by_id
349 Usage : $assembly->get_contig_by_id($id)
350 Function: Get a reference for a contig
351 Returns : a Bio::Assembly::Contig object or undef
352 Args : [string] contig unique identifier (ID)
356 sub get_contig_by_id
{
358 my $contigID = shift;
360 return unless (exists $self->{'_contigs'}{$contigID});
362 return $self->{'_contigs'}{$contigID};
365 =head2 get_singlet_by_id
367 Title : get_singlet_by_id
368 Usage : $assembly->get_singlet_by_id()
369 Function: Get a reference for a singlet
370 Returns : Bio::Assembly::Singlet object or undef
371 Args : [string] a singlet ID
375 sub get_singlet_by_id
{
378 my $singletID = shift;
380 return unless (exists $self->{'_singlets'}{$singletID});
382 return $self->{'_singlets'}{$singletID};
385 =head1 Modifier methods
392 Usage : $assembly->add_contig($contig)
393 Function: Add a contig to the assembly
394 Returns : 1 on success
395 Args : a Bio::Assembly::Contig object
401 my ($self, $contig) = @_;
404 if( !ref $contig || ! $contig->isa('Bio::Assembly::Contig') ) {
405 $self->throw("Bio::Assembly::Scaffold::add_contig is unable to process".
406 " non Bio::Assembly::Contig object [", ref($contig), "]");
409 # Create and attribute contig ID
410 my $contigID = $contig->id();
411 if( !defined $contigID ) {
412 $contigID = 'Unknown_' . ($self->get_nof_contigs() + 1);
413 $contig->id($contigID);
414 $self->warn("Attributing ID $contigID to unnamed Bio::Assembly::Contig".
418 # Adding contig to scaffold
419 $self->warn("Replacing contig $contigID with a new contig object")
420 if (exists $self->{'_contigs'}{$contigID});
421 $self->{'_contigs'}{$contigID} = $contig;
422 $contig->assembly($self); # weak circular reference
424 # Put contig sequences in the list of sequences belonging to the scaffold
425 foreach my $seqID ($contig->get_seq_ids()) {
426 if (exists $self->{'_seqs'}{$seqID} &&
427 not($self->{'_seqs'}{$seqID} eq $contig) ) {
428 $self->warn( "Sequence $seqID already assigned to object ".
429 $self->{'_seqs'}{$seqID}->id().". Moving to contig $contigID");
431 $self->{'_seqs'}{$seqID} = $contig;
440 Usage : $assembly->add_singlet($seq)
441 Function: Add a singlet to the assembly
442 Returns : 1 on success
443 Args : a Bio::Assembly::Singlet object
449 my ($self, $singlet) = @_;
452 if ( !ref $singlet || ! $singlet->isa('Bio::Assembly::Singlet') ) {
453 $self->throw("Bio::Assembly::Scaffold::add_singlet is unable to process".
454 " non Bio::Assembly::Singlet object [", ref($singlet), "]");
457 # Create and attribute singlet ID
458 my $singletID = $singlet->id();
459 if( !defined $singletID ) {
460 $singletID = 'Unknown_' . ($self->get_nof_singlets() + 1);
461 $singlet->id($singletID);
462 $self->warn("Attributing ID $singletID to unnamed Bio::Assembly::".
466 # Adding singlet to scaffold
467 $self->warn("Replacing singlet $singletID with a new singlet object")
468 if (exists $self->{'_singlets'}{$singletID});
469 $self->{'_singlets'}{$singletID} = $singlet;
470 $singlet->assembly($self); # weak circular reference
472 # Put singlet sequence in the list of sequences belonging to the scaffold
473 my $seqID = $singlet->seqref->id();
474 if (exists $self->{'_seqs'}{$seqID} &&
475 not($self->{'_seqs'}{$seqID} eq $singlet) ) {
476 $self->warn( "Sequence $seqID already assigned to object ".
477 $self->{'_seqs'}{$seqID}->id().". Moving to singlet $singletID");
479 $self->{'_seqs'}{$seqID} = $singlet;
484 =head2 update_seq_list
486 Title : update_seq_list
487 Usage : $assembly->update_seq_list()
490 Synchronizes the assembly registry for sequences in
491 contigs and contig actual aligned sequences content. You
492 probably want to run this after you remove/add a
493 sequence from/to a contig in the assembly.
495 Returns : 1 for success
500 sub update_seq_list
{
503 $self->{'_seqs'} = {};
505 # Put sequences in contigs in list of sequences belonging to the scaffold
506 foreach my $contig ($self->all_contigs) {
507 my $contigID = $contig->id();
508 foreach my $seqID ($contig->get_seq_ids) {
509 if (exists $self->{'_seqs'}{$seqID} &&
510 not($self->{'_seqs'}{$seqID} eq $contig) ) {
511 $self->warn( "Sequence $seqID already assigned to object ".
512 $self->{'_seqs'}{$seqID}->id().". Moving to contig $contigID");
514 $self->{'_seqs'}{$seqID} = $contig;
518 # Put singlet sequences in the list of sequences belonging to the scaffold
519 foreach my $singlet ($self->all_singlets) {
520 my $singletID = $singlet->id();
521 my $seqID = $singlet->seqref->id();
522 if (exists $self->{'_seqs'}{$seqID} &&
523 not($self->{'_seqs'}{$seqID} eq $singlet) ) {
524 $self->warn( "Sequence $seqID already assigned to object ".
525 $self->{'_seqs'}{$seqID}->id().". Moving to singlet $singletID");
527 $self->{'_seqs'}{$seqID} = $singlet;
533 =head2 remove_contigs
535 Title : remove_contigs
536 Usage : $assembly->remove_contigs(1..4)
537 Function: Remove contig from assembly object
538 Returns : an array of removed Bio::Assembly::Contig
540 Args : an array of contig IDs
542 See function get_contig_ids() above
547 my ($self, @args) = @_;
550 foreach my $contigID (@args) {
551 foreach my $seqID ($self->get_contig_by_id($contigID)->get_seq_ids()) {
552 delete $self->{'_seqs'}{$seqID};
554 push(@ret, $self->{'_contigs'}{$contigID});
555 delete $self->{'_contigs'}{$contigID};
561 =head2 remove_singlets
563 Title : remove_singlets
564 Usage : $assembly->remove_singlets(@singlet_ids)
565 Function: Remove singlet from assembly object
566 Returns : the Bio::Assembly::Singlet objects removed
567 Args : a list of singlet IDs
569 See function get_singlet_ids() above
573 sub remove_singlets
{
574 my ($self,@args) = @_;
577 foreach my $singletID (@args) {
578 push(@ret,$self->{'_singlets'}{$singletID});
579 delete $self->{'_singlets'}{$singletID};
585 =head2 remove_features_collection
587 Title : remove_features_collection
588 Usage : $assembly->remove_features_collection()
589 Function: Removes the collection of features associated to every
590 contig and singlet of the scaffold. This can be useful
591 to save some memory (when contig and singlet features are
598 sub remove_features_collection
{
600 for my $obj ( $self->all_contigs, $self->all_singlets ) {
601 $obj->remove_features_collection;
607 =head1 Contig and singlet selection methods
611 =head2 select_contigs
613 Title : select_contigs
614 Usage : $assembly->select_contigs(@list)
615 Function: Select an array of contigs from the assembly
616 Returns : an array of Bio::Assembly::Contig objects
617 Args : an array of contig ids
619 See function get_contig_ids() above
624 my ($self,@args) = @_;
627 foreach my $contig (@args) {
628 unless (exists $self->{'_contigs'}{$contig}) {
629 $self->warn("$contig contig not found. Ignoring...");
632 push(@contigs, $self->{'_contigs'}{$contig});
638 =head2 select_singlets
640 Title : select_singlets
641 Usage : $assembly->select_singlets(@list)
642 Function: Selects an array of singlets from the assembly
643 Returns : an array of Bio::Assembly::Singlet objects
644 Args : an array of singlet ids
646 See function get_singlet_ids() above
650 sub select_singlets
{
651 my ($self,@args) = @_;
654 foreach my $singlet (@args) {
655 unless (exists $self->{'_singlets'}{$singlet}) {
656 $self->warn("$singlet singlet not found. Ignoring...");
659 push(@singlets, $self->{'_singlets'}{$singlet});
668 Usage : my @contigs = $assembly->all_contigs
671 Returns a list of all contigs in this assembly. Contigs
672 are both clusters and alignments of one or more reads,
673 with an associated consensus sequence.
675 Returns : array of Bio::Assembly::Contig (in lexical id order)
684 foreach my $contig (sort { $a cmp $b } keys %{ $self->{'_contigs'} }) {
685 push(@contigs, $self->{'_contigs'}{$contig});
694 Usage : my @singlets = $assembly->all_singlets
697 Returns a list of all singlets in this assembly.
698 Singlets are isolated reads, without non-vector
699 matches to any other read in the assembly.
701 Returns : array of Bio::Assembly::Singlet objects (in lexical order by id)
710 foreach my $singlet (sort { $a cmp $b } keys %{ $self->{'_singlets'} }) {
711 push(@singlets, $self->{'_singlets'}{$singlet});