Bio::Tools::CodonTable::is_start_codon: check in case of ambiguous codons (#266)
[bioperl-live.git] / lib / Bio / SeqIO / gcg.pm
blobda28db2cdecc1b52b3f6dcae57f8128517219fac
2 # BioPerl module for Bio::SeqIO::gcg
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Ewan Birney <birney@ebi.ac.uk>
7 # and Lincoln Stein <lstein@cshl.org>
9 # Copyright Ewan Birney & Lincoln Stein
11 # You may distribute this module under the same terms as perl itself
13 # _history
14 # October 18, 1999 Largely rewritten by Lincoln Stein
16 # POD documentation - main docs before the code
18 =head1 NAME
20 Bio::SeqIO::gcg - GCG sequence input/output stream
22 =head1 SYNOPSIS
24 Do not use this module directly. Use it via the Bio::SeqIO class.
26 =head1 DESCRIPTION
28 This object can transform Bio::Seq objects to and from GCG flat
29 file databases.
31 =head1 FEEDBACK
33 =head2 Mailing Lists
35 User feedback is an integral part of the evolution of this and other
36 Bioperl modules. Send your comments and suggestions preferably to one
37 of the Bioperl mailing lists. Your participation is much appreciated.
39 bioperl-l@bioperl.org - General discussion
40 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
42 =head2 Support
44 Please direct usage questions or support issues to the mailing list:
46 I<bioperl-l@bioperl.org>
48 rather than to the module maintainer directly. Many experienced and
49 reponsive experts will be able look at the problem and quickly
50 address it. Please include a thorough description of the problem
51 with code and data examples if at all possible.
53 =head2 Reporting Bugs
55 Report bugs to the Bioperl bug tracking system to help us keep track
56 the bugs and their resolution. Bug reports can be submitted via the web:
58 https://github.com/bioperl/bioperl-live/issues
60 =head1 AUTHORS - Ewan Birney & Lincoln Stein
62 Email: E<lt>birney@ebi.ac.ukE<gt>
63 E<lt>lstein@cshl.orgE<gt>
65 =head1 CONTRIBUTORS
67 Jason Stajich, jason@bioperl.org
69 =head1 APPENDIX
71 The rest of the documentation details each of the object
72 methods. Internal methods are usually preceded with a _
74 =cut
76 # Let the code begin...
78 package Bio::SeqIO::gcg;
80 use strict;
82 use Bio::Seq::SeqFactory;
84 use base qw(Bio::SeqIO);
86 sub _initialize {
87 my($self,@args) = @_;
88 $self->SUPER::_initialize(@args);
89 if( ! defined $self->sequence_factory ) {
90 $self->sequence_factory(Bio::Seq::SeqFactory->new
91 (-verbose => $self->verbose(),
92 -type => 'Bio::Seq::RichSeq'));
96 =head2 next_seq
98 Title : next_seq
99 Usage : $seq = $stream->next_seq()
100 Function: returns the next sequence in the stream
101 Returns : Bio::Seq object
102 Args :
104 =cut
106 sub next_seq {
107 my ($self,@args) = @_;
108 my($id,$type,$desc,$line,$chksum,$sequence,$date,$len);
110 while( defined($_ = $self->_readline()) ) {
112 ## Get the descriptive info (anything before the line with '..')
113 unless( /\.\.$/ ) { $desc.= $_; }
114 ## Pull ID, Checksum & Type from the line containing '..'
115 /\.\.$/ && do { $line = $_; chomp;
116 if(/Check\:\s(\d+)\s/) { $chksum = $1; }
117 if(/Type:\s(\w)\s/) { $type = $1; }
118 if(/(\S+)\s+Length/)
119 { $id = $1; }
120 if(/Length:\s+(\d+)\s+(\S.+\S)\s+Type/ )
121 { $len = $1; $date = $2;}
122 last;
125 return if ( !defined $_);
126 chomp($desc); # remove last "\n"
128 while( defined($_ = $self->_readline()) ) {
130 ## This is where we grab the sequence info.
132 if( /\.\.$/ ) {
133 $self->throw("Looks like start of another sequence. See documentation. ");
136 next if($_ eq "\n"); ## skip whitespace lines in formatted seq
137 s/[\d\s\t]//g; ## remove anything that is not alphabet char: preserve anything that is not explicitly specified for removal (Stefan Kirov)
138 # $_ = uc($_); ## uppercase sequence: NO. Keep the case. HL
139 $sequence .= $_;
141 ##If we parsed out a checksum, we might as well test it
143 if(defined $chksum) {
144 unless(_validate_checksum(uc($sequence),$chksum)) {
145 $self->throw("Checksum failure on parsed sequence.");
149 ## Remove whitespace from identifier because the constructor
150 ## will throw a warning otherwise...
151 if(defined $id) { $id =~ s/\s+//g;}
153 ## Turn our parsed "Type: N" or "Type: P" (if found) into the appropriate
154 ## keyword that the constructor expects...
155 if(defined $type) {
156 if($type eq "N") { $type = "dna"; }
157 if($type eq "P") { $type = "prot"; }
160 return $self->sequence_factory->create(-seq => $sequence,
161 -id => $id,
162 -desc => $desc,
163 -type => $type,
164 -dates => [ $date ]
168 =head2 write_seq
170 Title : write_seq
171 Usage : $stream->write_seq(@seq)
172 Function: writes the formatted $seq object into the stream
173 Returns : 1 for success and 0 for error
174 Args : array of Bio::PrimarySeqI object
177 =cut
179 sub write_seq {
180 my ($self,@seq) = @_;
181 for my $seq (@seq) {
182 $self->throw("Did not provide a valid Bio::PrimarySeqI object")
183 unless defined $seq && ref($seq) && $seq->isa('Bio::PrimarySeqI');
185 $self->warn("No whitespace allowed in GCG ID [". $seq->display_id. "]")
186 if $seq->display_id =~ /\s/;
188 my $str = $seq->seq;
189 my $comment = $seq->desc || '';
190 my $id = $seq->id;
191 my $type = ( $seq->alphabet() =~ /[dr]na/i ) ? 'N' : 'P';
192 my $timestamp;
194 if( $seq->can('get_dates') ) {
195 ($timestamp) = $seq->get_dates;
196 } else {
197 $timestamp = localtime(time);
199 my($sum,$offset,$len,$i,$j,$cnt,@out);
201 $len = length($str);
202 ## Set the offset if we have any non-standard numbering going on
203 $offset=1;
204 # checksum
205 $sum = $self->GCG_checksum($seq);
207 #Output the sequence header info
208 push(@out,"$comment\n");
209 push(@out,"$id Length: $len $timestamp Type: $type Check: $sum ..\n\n");
211 #Format the sequence
212 $i = $#out + 1;
213 for($j = 0 ; $j < $len ; ) {
214 if( $j % 50 == 0) {
215 $out[$i] = sprintf("%8d ",($j+$offset)); #numbering
217 $out[$i] .= sprintf("%s",substr($str,$j,10));
218 $j += 10;
219 if( $j < $len && $j % 50 != 0 ) {
220 $out[$i] .= " ";
221 }elsif($j % 50 == 0 ) {
222 $out[$i++] .= "\n\n";
225 local($^W) = 0;
226 if($j % 50 != 0 ) {
227 $out[$i] .= "\n";
229 $out[$i] .= "\n";
230 return unless $self->_print(@out);
233 $self->flush if $self->_flush_on_write && defined $self->_fh;
234 return 1;
237 =head2 GCG_checksum
239 Title : GCG_checksum
240 Usage : $cksum = $gcgio->GCG_checksum($seq);
241 Function : returns a gcg checksum for the sequence specified
243 This method can also be called as a class method.
244 Example :
245 Returns : a GCG checksum string
246 Argument : a Bio::PrimarySeqI implementing object
248 =cut
250 sub GCG_checksum {
251 my ($self,$seqobj) = @_;
252 my $index = 0;
253 my $checksum = 0;
254 my $char;
256 my $seq = $seqobj->seq();
257 $seq =~ tr/a-z/A-Z/;
259 foreach $char ( split(/[\.\-]*/, $seq)) {
260 $index++;
261 $checksum += ($index * (unpack("c",$char) || 0) );
262 if( $index == 57 ) {
263 $index = 0;
267 return ($checksum % 10000);
270 =head2 _validate_checksum
272 Title : _validate_checksum
273 Usage : n/a - internal method
274 Function: if parsed gcg sequence contains a checksum field
275 : we compare it to a value computed here on the parsed
276 : sequence. A checksum mismatch would indicate some
277 : type of parsing failure occurred.
279 Returns : 1 for success, 0 for failure
280 Args : string containing parsed seq, value of parsed checksum
283 =cut
285 sub _validate_checksum {
286 my($seq,$parsed_sum) = @_;
287 my($i,$len,$computed_sum,$cnt);
289 $len = length($seq);
291 #Generate the GCG Checksum value
293 for($i=0; $i<$len ;$i++) {
294 $cnt++;
295 $computed_sum += $cnt * ord(substr($seq,$i,1));
296 ($cnt == 57) && ($cnt=0);
298 $computed_sum %= 10000;
300 ## Compare and decide if success or failure
302 if($parsed_sum == $computed_sum) {
303 return 1;
304 } else { return 0; }