Bio::DB::TFBS namespace has been moved to its own distribution named after itself
[bioperl-live.git] / Bio / DB / SwissProt.pm
blob9c14684f9061683f0917bfad7267302aed3b7a91
3 # BioPerl module for Bio::DB::SwissProt
5 # Please direct questions and support issues to <bioperl-l@bioperl.org>
7 # Cared for by Jason Stajich <jason@bioperl.org>
9 # Copyright Jason Stajich
11 # You may distribute this module under the same terms as perl itself
13 # POD documentation - main docs before the code
14 # Reworked to use Bio::DB::WebDBSeqI 2000-12-11
16 =head1 NAME
18 Bio::DB::SwissProt - Database object interface to SwissProt retrieval
20 =head1 SYNOPSIS
22 use Bio::DB::SwissProt;
24 $sp = Bio::DB::SwissProt->new();
26 $seq = $sp->get_Seq_by_id('KPY1_ECOLI'); # SwissProt ID
27 # <4-letter-identifier>_<species 5-letter code>
28 # or ...
29 $seq = $sp->get_Seq_by_acc('P43780'); # SwissProt AC
30 # [OPQ]xxxxx
33 # In fact in this implementation
34 # these methods call the same webscript so you can use
35 # then interchangeably
37 # choose a different server to query
38 $sp = Bio::DB::SwissProt->new('-servertype' => 'expasy',
39 '-hostlocation' => 'us');
41 $seq = $sp->get_Seq_by_id('BOLA_HAEIN'); # SwissProtID
43 =head1 DESCRIPTION
45 SwissProt is a curated database of proteins managed by the Swiss
46 Bioinformatics Institute. Additional tools for
47 parsing and manipulating swissprot files can be found at
48 ftp://ftp.ebi.ac.uk/pub/software/swissprot/Swissknife/.
50 Allows the dynamic retrieval of Sequence objects (Bio::Seq) from the
51 SwissProt database via an Expasy retrieval.
53 In order to make changes transparent we have host type (currently only
54 expasy) and location (default to Switzerland) separated out. This
55 allows the user to pick the closest Expasy mirror for running their
56 queries.
59 =head1 FEEDBACK
61 =head2 Mailing Lists
63 User feedback is an integral part of the evolution of this and other
64 Bioperl modules. Send your comments and suggestions preferably to one
65 of the Bioperl mailing lists. Your participation is much appreciated.
68 bioperl-l@bioperl.org - General discussion
69 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
71 =head2 Support
73 Please direct usage questions or support issues to the mailing list:
75 I<bioperl-l@bioperl.org>
77 rather than to the module maintainer directly. Many experienced and
78 reponsive experts will be able look at the problem and quickly
79 address it. Please include a thorough description of the problem
80 with code and data examples if at all possible.
82 =head2 Reporting Bugs
84 Report bugs to the Bioperl bug tracking system to help us keep track
85 the bugs and their resolution. Bug reports can be submitted via the
86 web:
88 https://github.com/bioperl/bioperl-live/issues
90 =head1 AUTHOR - Jason Stajich
92 Email Jason Stajich E<lt>jason@bioperl.org E<lt>
94 Thanks go to Alexandre Gattiker E<lt>gattiker@isb-sib.chE<gt> of Swiss
95 Institute of Bioinformatics for helping point us in the direction of
96 the correct expasy scripts and for swissknife references.
98 Also thanks to Heikki Lehvaslaiho E<lt>heikki-at-bioperl-dot-orgE<gt>
99 for help with adding EBI swall server.
101 =head1 APPENDIX
103 The rest of the documentation details each of the object
104 methods. Internal methods are usually preceded with a _
106 =cut
108 # Let the code begin...
110 package Bio::DB::SwissProt;
111 use strict;
113 use HTTP::Request::Common;
114 our $MODVERSION = '0.8.1';
116 use base qw(Bio::DB::WebDBSeqI);
118 # global vars
119 our $DEFAULTSERVERTYPE = 'ebi';
120 our $DEFAULTFORMAT = 'swissprot';
121 # our $DEFAULTIDTRACKER = 'http://www.expasy.ch';
123 # you can add your own here theoretically.
124 our %HOSTS = (
125 'expasy' => {
126 'default' => 'us',
127 'baseurl' => 'http://%s/cgi-bin/sprot-retrieve-list.pl',
128 'hosts' =>
130 'switzerland' => 'ch.expasy.org',
131 'canada' => 'ca.expasy.org',
132 'china' => 'cn.expasy.org',
133 'taiwan' => 'tw.expasy.org',
134 'australia' => 'au.expasy.org',
135 'korea' => 'kr.expasy.org',
136 'us' => 'us.expasy.org',
138 # ick, CGI variables
139 'jointype' => ' ',
140 'idvar' => 'list',
141 'basevars' => [ ],
143 'ebi' => {
144 'default' => 'uk',
145 'baseurl' => 'http://%s/Tools/dbfetch/dbfetch',
146 'hosts' => {
147 'uk' => 'www.ebi.ac.uk',
149 'jointype' => ',',
150 'idvar' => 'id',
151 'basevars' => [ 'db' => 'UniProtKB',
152 'style' => 'raw' ],
156 our %ID_MAPPING_DATABASES = map {$_ => 1} qw(
157 ACC+ID ACC ID UPARC NF50 NF90 NF100 EMBL_ID EMBL PIR UNIGENE_ID P_ENTREZGENEID
158 P_GI P_IPI P_REFSEQ_AC PDB_ID DISPROT_ID HSSP_ID DIP_ID MEROPS_ID PEROXIBASE_ID
159 PPTASEDB_ID REBASE_ID TCDB_ID 2DBASE_ECOLI_ID AARHUS_GHENT_2DPAGE_ID
160 ANU_2DPAGE_ID DOSAC_COBS_2DPAGE_ID ECO2DBASE_ID WORLD_2DPAGE_ID ENSEMBL_ID
161 ENSEMBL_PRO_ID ENSEMBL_TRS_ID P_ENTREZGENEID GENOMEREVIEWS_ID KEGG_ID TIGR_ID
162 UCSC_ID VECTORBASE_ID AGD_ID ARACHNOSERVER_ID BURULIST_ID CGD CYGD_ID
163 DICTYBASE_ID ECHOBASE_ID ECOGENE_ID EUHCVDB_ID FLYBASE_ID GENECARDS_ID
164 GENEDB_SPOMBE_ID GENEFARM_ID H_INVDB_ID HGNC_ID HPA_ID LEGIOLIST_ID LEPROMA_ID
165 LISTILIST_ID MAIZEGDB_ID MIM_ID MGI_ID MYPULIST_ID NMPDR ORPHANET_ID PHARMGKB_ID
166 PHOTOLIST_ID PSEUDOCAP_ID RGD_ID SAGALIST_ID SGD_ID SUBTILIST_ID TAIR_ID
167 TUBERCULIST_ID WORMBASE_ID WORMPEP_ID XENBASE_ID ZFIN_ID EGGNOG_ID OMA_ID
168 ORTHODB_ID BIOCYC_ID REACTOME_ID CLEANEX_ID GERMONLINE_ID DRUGBANK_ID
169 NEXTBIO_ID);
171 # new modules should be a little more lightweight and
172 # should use Bio::Root::Root
173 sub new {
174 my ($class, @args) = @_;
175 my $self = $class->SUPER::new(@args);
177 my ($format, $hostlocation,$servertype) =
178 $self->_rearrange([qw(FORMAT HOSTLOCATION SERVERTYPE)],
179 @args);
181 if( $format && $format !~ /(swiss)|(fasta)/i ) {
182 $self->warn("Requested Format $format is ignored because only SwissProt and Fasta formats are currently supported");
183 $format = $self->default_format;
185 $servertype = $DEFAULTSERVERTYPE unless $servertype;
186 $servertype = lc $servertype;
187 $self->servertype($servertype);
188 if ( $hostlocation ) {
189 $self->hostlocation(lc $hostlocation);
192 $self->request_format($format); # let's always override the format, as it must be swiss or fasta
193 return $self;
196 =head2 Routines from Bio::DB::RandomAccessI
198 =cut
200 =head2 get_Seq_by_id
202 Title : get_Seq_by_id
203 Usage : $seq = $db->get_Seq_by_id('ROA1_HUMAN')
204 Function: Gets a Bio::Seq object by its name
205 Returns : a Bio::Seq object
206 Args : the id (as a string) of a sequence
207 Throws : "id does not exist" exception
209 =cut
211 =head2 get_Seq_by_acc
213 Title : get_Seq_by_acc
214 Usage : $seq = $db->get_Seq_by_acc('X77802');
215 Function: Gets a Bio::Seq object by accession number
216 Returns : A Bio::Seq object
217 Args : accession number (as a string)
218 Throws : "acc does not exist" exception
220 =cut
222 =head2 get_Stream_by_id
224 Title : get_Stream_by_id
225 Usage : $stream = $db->get_Stream_by_id( [$uid1, $uid2] );
226 Function: Gets a series of Seq objects by unique identifiers
227 Returns : a Bio::SeqIO stream object
228 Args : $ref : a reference to an array of unique identifiers for
229 the desired sequence entries
231 =cut
233 =head2 get_Stream_by_acc
235 Title : get_Stream_by_acc
236 Usage : $seq = $db->get_Seq_by_acc([$acc1, $acc2]);
237 Function: Gets a series of Seq objects by accession numbers
238 Returns : a Bio::SeqIO stream object
239 Args : $ref : a reference to an array of accession numbers for
240 the desired sequence entries
241 Note : For GenBank, this just calls the same code for get_Stream_by_id()
243 =cut
245 =head2 get_Stream_by_batch
247 Title : get_Stream_by_batch
248 Usage : $seq = $db->get_Stream_by_batch($ref);
249 Function: Retrieves Seq objects from SwissProt 'en masse', rather than one
250 at a time. This is implemented the same way as get_Stream_by_id,
251 but is provided here in keeping with access methods of NCBI
252 modules.
253 Example :
254 Returns : a Bio::SeqIO stream object
255 Args : $ref : either an array reference, a filename, or a filehandle
256 from which to get the list of unique ids/accession numbers.
258 NOTE: deprecated API. Use get_Stream_by_id() instead.
260 =cut
262 *get_Stream_by_batch = sub {
263 my $self = shift;
264 $self->deprecated('get_Stream_by_batch() is deprecated; use get_Stream_by_id() instead');
265 $self->get_Stream_by_id(@_)
268 =head2 Implemented Routines from Bio::DB::WebDBSeqI interface
270 =cut
272 =head2 get_request
274 Title : get_request
275 Usage : my $url = $self->get_request
276 Function: returns a HTTP::Request object
277 Returns :
278 Args : %qualifiers = a hash of qualifiers (ids, format, etc)
280 =cut
282 sub get_request {
283 my ($self, @qualifiers) = @_;
284 my ($uids, $format) = $self->_rearrange([qw(UIDS FORMAT)],
285 @qualifiers);
287 if( !defined $uids ) {
288 $self->throw("Must specify a value for uids to query");
290 my ($f,undef) = $self->request_format($format);
292 my %vars = (
293 @{$HOSTS{$self->servertype}->{'basevars'}},
294 ( 'format' => $f )
297 my $url = $self->location_url;
299 my $uid;
300 my $jointype = $HOSTS{$self->servertype}->{'jointype'} || ' ';
301 my $idvar = $HOSTS{$self->servertype}->{'idvar'} || 'id';
303 if( ref($uids) =~ /ARRAY/i ) {
304 # HTTP::Request automagically converts the ' ' to %20
305 $uid = join($jointype, @$uids);
306 } else {
307 $uid = $uids;
309 $vars{$idvar} = $uid;
311 return POST $url, \%vars;
314 =head2 postprocess_data
316 Title : postprocess_data
317 Usage : $self->postprocess_data ( 'type' => 'string',
318 'location' => \$datastr);
319 Function: process downloaded data before loading into a Bio::SeqIO
320 Returns : void
321 Args : hash with two keys - 'type' can be 'string' or 'file'
322 - 'location' either file location or string
323 reference containing data
325 =cut
327 # don't need to do anything
329 sub postprocess_data {
330 my ($self, %args) = @_;
331 return;
334 =head2 default_format
336 Title : default_format
337 Usage : my $format = $self->default_format
338 Function: Returns default sequence format for this module
339 Returns : string
340 Args : none
342 =cut
344 sub default_format {
345 return $DEFAULTFORMAT;
348 =head2 Bio::DB::SwissProt specific routines
350 =cut
352 =head2 servertype
354 Title : servertype
355 Usage : my $servertype = $self->servertype
356 $self->servertype($servertype);
357 Function: Get/Set server type
358 Returns : string
359 Args : server type string [optional]
361 =cut
363 sub servertype {
364 my ($self, $servertype) = @_;
365 if( defined $servertype && $servertype ne '') {
366 $self->throw("You gave an invalid server type ($servertype)".
367 " - available types are ".
368 keys %HOSTS) unless( $HOSTS{$servertype} );
369 $self->{'_servertype'} = $servertype;
370 $self->{'_hostlocation'} = $HOSTS{$servertype}->{'default'};
372 # make sure format is reset properly in that different
373 # servers have different syntaxes
374 my ($existingformat,$seqioformat) = $self->request_format;
375 $self->request_format($existingformat);
377 return $self->{'_servertype'} || $DEFAULTSERVERTYPE;
381 =head2 hostlocation
383 Title : hostlocation
384 Usage : my $location = $self->hostlocation()
385 $self->hostlocation($location)
386 Function: Set/Get Hostlocation
387 Returns : string representing hostlocation
388 Args : string specifying hostlocation [optional]
390 =cut
392 sub hostlocation {
393 my ($self, $location ) = @_;
394 my $servertype = $self->servertype;
395 $self->throw("Must have a valid servertype defined not $servertype")
396 unless defined $servertype;
397 my %hosts = %{$HOSTS{$servertype}->{'hosts'}};
398 if( defined $location && $location ne '' ) {
399 $location = lc $location;
400 if( ! $hosts{$location} ) {
401 $self->throw("Must specify a known host, not $location,".
402 " possible values (".
403 join(",", sort keys %hosts ). ")");
405 $self->{'_hostlocation'} = $location;
407 return $self->{'_hostlocation'};
410 =head2 location_url
412 Title : location
413 Usage : my $url = $self->location_url()
414 Function: Get host url
415 Returns : string representing url
416 Args : none
418 =cut
420 sub location_url {
421 my ($self) = @_;
422 my $servertype = $self->servertype();
423 my $location = $self->hostlocation();
425 if( ! defined $location || !defined $servertype ) {
426 $self->throw("must have a valid hostlocation and servertype set before calling location_url");
428 return sprintf($HOSTS{$servertype}->{'baseurl'},
429 $HOSTS{$servertype}->{'hosts'}->{$location});
432 =head2 request_format
434 Title : request_format
435 Usage : my ($req_format, $ioformat) = $self->request_format;
436 $self->request_format("genbank");
437 $self->request_format("fasta");
438 Function: Get/Set sequence format retrieval. The get-form will normally
439 not be used outside of this and derived modules.
440 Returns : Array of two strings, the first representing the format for
441 retrieval, and the second specifying the corresponding SeqIO
442 format.
443 Args : $format = sequence format
445 =cut
447 sub request_format {
448 my ($self, $value) = @_;
449 if( defined $value ) {
450 if( $self->servertype =~ /expasy/ ) {
451 if( $value =~ /sprot/ || $value =~ /swiss/ ) {
452 $self->{'_format'} = [ 'sprot', 'swiss'];
453 } elsif( $value =~ /^fa/ ) {
454 $self->{'_format'} = [ 'fasta', 'fasta'];
455 } else {
456 $self->warn("Unrecognized format $value requested");
457 $self->{'_format'} = [ 'fasta', 'fasta'];
459 } elsif( $self->servertype =~ /ebi/ ) {
460 if( $value =~ /sprot/ || $value =~ /swiss/ ) {
461 $self->{'_format'} = [ 'swissprot', 'swiss' ];
462 } elsif( $value =~ /^fa/ ) {
463 $self->{'_format'} = [ 'fasta', 'fasta'];
464 } else {
465 $self->warn("Unrecognized format $value requested");
466 $self->{'_format'} = [ 'swissprot', 'swiss'];
470 return @{$self->{'_format'}};
473 =head2 idtracker
475 Title : idtracker
476 Usage : my ($newid) = $self->idtracker($oldid);
477 Function: Retrieve new ID using old ID.
478 Returns : single ID if one is found
479 Args : ID to look for
481 =cut
483 sub idtracker {
484 my ($self, $id) = @_;
485 $self->deprecated(
486 -message => 'The SwissProt IDTracker service is no longer available, '.
487 'use id_mapper() instead',
488 -warn_version => 1.006, # warn if $VERSION is >= this version
489 -throw_version => 1.007 # throw if $VERSION is >= this version
493 =head2 id_mapper
495 Title : id_tracker
496 Usage : my $map = $self->id_mapper( -from => '',
497 -to => '',
498 -ids => \@ids);
499 Function: Retrieve new ID using old ID.
500 Returns : hash reference of successfully mapped IDs
501 Args : -from : database mapping from
502 -to : database mapped to
503 -ids : a single ID or array ref of IDs to map
504 Note : For a list of valid database IDs, see:
505 http://www.uniprot.org/faq/28#id_mapping_examples
507 =cut
509 sub id_mapper {
510 my $self = shift;
511 my ($from, $to, $ids) = $self->_rearrange([qw(FROM TO IDS)], @_);
512 for ($from, $to) {
513 $self->throw("$_ is not a recognized database") if !exists $ID_MAPPING_DATABASES{$_};
515 my @ids = ref $ids ? @$ids : $ids;
516 my $params = {
517 from => $from,
518 to => $to,
519 format => 'tab',
520 query => join(' ',@ids)
522 my $ua = $self->ua;
523 push @{ $ua->requests_redirectable }, 'POST';
524 my $response = $ua->post("http://www.uniprot.org/mapping/", $params);
525 while (my $wait = $response->header('Retry-After')) {
526 $self->debug("Waiting...\n");
527 $self->_sleep;
528 $response = $ua->get($response->base);
531 my %map;
532 if ($response->is_success) {
533 for my $line (split("\n", $response->content)) {
534 my ($id_from, $id_to) = split(/\s+/, $line, 2);
535 next if $id_from eq 'From';
536 push @{$map{$id_from}}, $id_to;
538 } else {
539 $self->throw("Error: ".$response->status_line."\n");
541 \%map;
546 __END__