2 # BioPerl module for interpro
3 # You may distribute this module under the same terms as perl itself
5 # POD documentation - main docs before the code
9 Bio::SeqIO::interpro - InterProScan XML input/output stream
13 # do not call this module directly, use Bio::SeqIO
18 my $io = Bio::SeqIO->new(-format => "interpro",
19 -file => $interpro_file);
21 while (my $seq = $io->next_seq) {
22 # use the Sequence object
27 L<Bio::SeqIO::interpro> will parse Interpro scan XML (version 1.2) and
28 create L<Bio::SeqFeature::Generic> objects based on the contents of the
31 L<Bio::SeqIO::interpro> will also attach the annotation given in the XML
32 file to the L<Bio::SeqFeature::Generic> objects that it creates.
38 User feedback is an integral part of the evolution of this and other
39 Bioperl modules. Send your comments and suggestions preferably to
40 the Bioperl mailing list. Your participation is much appreciated.
42 bioperl-l@bioperl.org - General discussion
43 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
47 Please direct usage questions or support issues to the mailing list:
49 I<bioperl-l@bioperl.org>
51 rather than to the module maintainer directly. Many experienced and
52 responsive experts will be able to look at the problem and quickly
53 address it. Please include a thorough description of the problem
54 with code and data examples if at all possible.
58 Report bugs to the Bioperl bug tracking system to help us keep track
59 of the bugs and their resolution. Bug reports can be submitted via
62 https://github.com/bioperl/bioperl-live/issues
64 =head1 AUTHOR - Jared Fox
66 Email jaredfox@ucla.edu
70 Allen Day allenday@ucla.edu
74 The rest of the documentation details each of the object methods.
75 Internal methods are usually preceded with a _
79 # Let the code begin...
81 package Bio
::SeqIO
::interpro
;
83 use Bio
::SeqFeature
::Generic
;
86 use Bio
::Seq
::SeqFactory
;
87 use Bio
::Annotation
::Collection
;
88 use Bio
::Annotation
::DBLink
;
89 use base
qw(Bio::SeqIO);
# File-scoped lexicals declared directly after the module's use/base lines.
# NOTE(review): neither variable is referenced in the visible part of this
# file -- confirm they are still needed before keeping them.
91 my $idcounter = {}; # Used to generate unique id values
92 my $nvtoken = ": "; # The token used if a name/value pair has to be stuffed
98 Usage : my $seqobj = $stream->next_seq
99 Function: Retrieves the next sequence from a SeqIO::interpro stream.
100 Returns : A Bio::Seq::RichSeq object
# Body of next_seq: read one <protein> record from the stream, repair a few
# known malformed strings, parse the record into a DOM, and turn every
# match location into an annotated sequence feature on a new sequence object.
# Obtain an empty sequence object from the configured factory.
108 my $bioSeq = $self->_sequence_factory->create(-verbose
=>$self->verbose());
# Literal phrases (including their double quotes) that are looked for in
# each input line.
110 my $zinc = "(\"zincins\")";
111 my $wing = "\"Winged helix\"";
112 my $finger = "\"zinc finger\"";
# Accumulate input lines until the closing </protein> tag has been seen.
114 my $xml_fragment = undef;
115 while(my $line = $self->_readline()){
# index() looks for the literal text, so for $zinc that includes the
# surrounding parentheses.
# NOTE(review): $wherefinger is computed but not used in the visible code.
117 my $where = index($line, $zinc);
118 my $wherefinger = index($line, $finger);
119 my $finishedline = $line;
120 my $wingwhere = index($line, $wing);
122 # the interpro XML is not fully formed, so we need to convert the
123 # extra double quotes and ampersands into appropriate XML character codes
# Used as a regex, the parentheses in $zinc form a capture group, so split
# also returns the captured separator; that is why elements 0 and 2 are
# joined here while the other two repairs join elements 0 and 1.
# NOTE(review): index() above matches the parentheses literally whereas the
# regex does not -- confirm this asymmetry is intended.
# NOTE(review): the separator literals in the three join calls read as if
# their XML entities were decoded at some point: ""zincins"" and
# ""Winged helix"" do not parse as written, and joining on "&" leaves the
# ampersand unescaped. Verify against the upstream module.
125 my @linearray = split /$zinc/, $line;
126 $finishedline = join ""zincins"", $linearray[0], $linearray[2];
128 if(index($line, "&") > 0){
129 my @linearray = split /&/, $line;
130 $finishedline = join "&", $linearray[0], $linearray[1];
133 my @linearray = split /$wing/, $line;
134 $finishedline = join ""Winged helix"", $linearray[0], $linearray[1];
# Append the (possibly repaired) line and stop once the record is closed.
137 $xml_fragment .= $finishedline;
138 last if $finishedline =~ m!</protein>!;
140 # Match <protein> but not other similar elements like <protein-matches>
141 return unless $xml_fragment =~ /<protein[\s>]/;
# Parse the collected text; the resulting document is read back via _dom.
143 $self->_parse_xml($xml_fragment);
145 my $dom = $self->_dom;
# Select the protein element, its interpro children, and every match
# element under any interpro child (one flat list).
147 my ($protein_node) = $dom->findnodes('/protein');
148 my @interproNodes = $protein_node->findnodes('/protein/interpro');
149 my @DBNodes = $protein_node->findnodes('/protein/interpro/match');
# Walk the interpro elements by position; XPath positions are 1-based,
# hence the "+ 1" when building the paths.
150 for(my $interpn=0; $interpn<scalar(@interproNodes); $interpn++){
# NOTE(review): this path selects the interpro element itself, not its
# match children, yet the result is stored in @matchNodes and later asked
# for 'dbname' -- confirm whether "/match" is missing from the path.
151 my $ipnlevel = join "", "/protein/interpro[", $interpn + 1, "]";
152 my @matchNodes = $protein_node->findnodes($ipnlevel);
153 for(my $match=0; $match<scalar(@matchNodes); $match++){
# Location elements of the current match and classification elements of
# the current interpro entry.
154 my $matlevel = join "", "/protein/interpro[", $interpn+1, "]/match[",
155 $match+1, "]/location";
156 my @locNodes = $protein_node->findnodes($matlevel);
157 my $class_level = join "", "/protein/interpro[",$interpn+1, "]/classification";
158 my @goNodes = $protein_node->findnodes($class_level);
# One generic feature per element: coordinates and score come from that
# element's attributes, the display name from the interpro entry, and the
# seq_id from the protein element.
159 my @seqFeatures = map { Bio
::SeqFeature
::Generic
->new(
160 -start
=> $_->getAttribute('start'),
161 -end
=> $_->getAttribute('end'),
162 -score
=> $_->getAttribute('score'),
163 -source_tag
=> 'IPRscan',
164 -primary_tag
=> 'region',
165 -display_name
=> $interproNodes[$interpn]->getAttribute('name'),
166 -seq_id
=> $protein_node->getAttribute('id') ),
# Attach each feature to the sequence, then decorate it with dblink
# annotations.
168 foreach my $seqFeature (@seqFeatures){
169 $bioSeq->add_SeqFeature($seqFeature);
# dblink 1: dbname / id / name attributes of $matchNodes[$match].
171 my $annotation1 = Bio
::Annotation
::DBLink
->new;
172 $annotation1->database($matchNodes[$match]->getAttribute('dbname'));
173 $annotation1->primary_id($matchNodes[$match]->getAttribute('id'));
174 $annotation1->comment($matchNodes[$match]->getAttribute('name'));
175 $seqFeature->annotation->add_Annotation('dblink',$annotation1);
# dblink 2: the interpro entry itself, under the fixed database name
# 'INTERPRO'.
177 my $annotation2 = Bio
::Annotation
::DBLink
->new;
178 $annotation2->database('INTERPRO');
179 $annotation2->primary_id($interproNodes[$interpn]->getAttribute('id'));
180 $annotation2->comment($interproNodes[$interpn]->getAttribute('name'));
181 $seqFeature->annotation->add_Annotation('dblink',$annotation2);
183 # Bug 1908 (enhancement)
# dblink 3: attributes of an element taken from the flat @DBNodes list.
# NOTE(review): @DBNodes is indexed with the interpro counter, not a match
# counter -- confirm this picks the intended match element.
184 my $annotation3 = Bio
::Annotation
::DBLink
->new;
185 $annotation3->database($DBNodes[$interpn]->getAttribute('dbname'));
186 $annotation3->primary_id($DBNodes[$interpn]->getAttribute('id'));
187 $annotation3->comment($DBNodes[$interpn]->getAttribute('name'));
188 $seqFeature->annotation->add_Annotation('dblink',$annotation3);
189 # need to put in the go annotation here!
# One 'GO' dblink per classification element; its id attribute is used as
# both the primary id and the comment.
190 foreach my $g (@goNodes)
192 my $goid = $g->getAttribute('id');
193 my $go_annotation = Bio
::Annotation
::DBLink
->new;
194 $go_annotation->database('GO');
195 $go_annotation->primary_id($goid);
196 $go_annotation->comment($goid);
197 $seqFeature->annotation->add_Annotation('dblink', $go_annotation);
# Copy the protein element's id and name onto the sequence object.
202 my $accession = $protein_node->getAttribute('id');
203 my $displayname = $protein_node->getAttribute('name');
204 $bioSeq->accession($accession);
205 $bioSeq->display_name($displayname);
# Body of _initialize: run the parent initialisation, skip the input ahead
# to the first <protein> line, and install the XML parser and a default
# sequence factory.
220 my($self,@args) = @_;
# Let the parent class process the constructor arguments first.
222 $self->SUPER::_initialize
(@args);
223 # hash for functions for decoding keys.
224 $self->{'_func_ftunit_hash'} = {};
# Copy the arguments into a hash and add a lowercase-keyed duplicate of
# every entry.
# NOTE(review): %param is not read again in the visible code.
226 my %param = @args; # From SeqIO.pm
227 @param{ map { lc $_ } keys %param } = values %param; # lowercase keys
230 # fast forward to first <protein/> record.
# The matching line is pushed back so that the next read returns it again.
# NOTE(review): no declaration of $line is visible here -- confirm it is
# declared with "my" just above this loop.
231 while($line = $self->_readline()){
232 # Match <protein> but not other similar elements like <protein-matches>
233 if($line =~ /<protein[\s>]/){
234 $self->_pushback($line);
# Store a fresh DOM parser for later use by _parse_xml.
239 $self->_xml_parser( XML
::DOM
::Parser
->new() );
# Install a factory producing Bio::Seq::RichSeq objects, but only when the
# guard below finds none defined.
# NOTE(review): the guard calls sequence_factory (no leading underscore)
# while the setter is _sequence_factory -- presumably the former is
# inherited from the parent class; confirm both refer to the same slot.
241 $self->_sequence_factory( Bio
::Seq
::SeqFactory
->new
242 ( -verbose
=> $self->verbose(),
243 -type
=> 'Bio::Seq::RichSeq'))
244 if ( ! defined $self->sequence_factory );
247 =head2 _sequence_factory
249 Title : _sequence_factory
# Combined getter/setter for the object's 'sequence_factory' slot.
# NOTE(review): the line that unpacks $self and $val from @_ is not visible
# in this view -- confirm it is present.
257 sub _sequence_factory
{
# Overwrite the stored value only when a defined $val is supplied, then
# return whatever is stored.
261 $self->{'sequence_factory'} = $val if defined($val);
262 return $self->{'sequence_factory'};
# Getter/setter body for the 'xml_parser' slot: store $val only when it is
# defined, and always return the stored value.
279 $self->{'xml_parser'} = $val if defined($val);
280 return $self->{'xml_parser'};
# Parse the XML text in $xml with the stored parser and keep the result in
# the object via _dom.
294 my ($self,$xml) = @_;
295 $self->_dom( $self->_xml_parser->parse($xml) );
# Getter/setter body for the 'dom' slot: store $val only when it is
# defined, and always return the stored value.
313 $self->{'dom'} = $val if defined($val);
314 return $self->{'dom'};