bin/bp_fetch

   1 #!/usr/bin/perl
   2
   3 =head1 NAME
   4
   5 bp_fetch.pl - fetches sequences from bioperl indexed databases
   6
   7 =head1 SYNOPSIS
   8
   9   bp_fetch.pl swiss:ROA1_HUMAN
  10
  11   bp_fetch.pl net::genbank:JX295726
  12
  13   bp_fetch.pl net::genpept:ROA1_HUMAN
  14
  15   bp_fetch.pl ace::myserver.somewhere.edu,21000:X56676
  16
  17   bp_fetch.pl -fmt GCG swiss:ROA1_HUMAN
  18
  19 =head1 DESCRIPTION
  20
  21 Fetches sequences using the DB access systems in Bioperl. The most
  22 common use of this is to fetch sequences from bioperl indices built
  23 using bp_index.pl, or to fetch sequences from the NCBI website.
  24
  25 The format for retrieving sequences is deliberately modelled on the
  26 GCG/EMBOSS format, as in the following:
  27
  28   db:name
  29
  30 with the potential of putting in a 'meta' database type, being
  31
  32   meta::db:name
  33
  34 The meta information can be one of three types
  35
  36   local - local indexed flat file database
  37   net   - networked http: based database
  38   ace   - ACeDB database
  39
  40 This information defaults to 'local' for database names with no meta
  41 db information
  42
  43 =head1 OPTIONS
  44
  45   -fmt  <format> - Output format
  46                    Fasta (default), EMBL, Raw, swiss or GCG
  47   -acc           - string is an accession number, not an
  48                    id.
  49
  50 options only for expert use
  51
  52   -dir  <dir>    - directory to find the index files
  53                   (overrides BIOPERL_INDEX environment variable)
  54   -type <type>   - type of DBM file to open
  55                   (overrides BIOPERL_INDEX_TYPE environment variable)
  56
  57 =head1 ENVIRONMENT
  58
  59 bp_index and bp_fetch coordinate where the databases lie using the
  60 environment variable BIOPERL_INDEX. This can be overridden using the
  61 -dir option. The index type (SDBM or DB_File or another index file)
  62 is controlled by the BIOPERL_INDEX_TYPE variable, which can be
  63 overridden using the -type option. The index type defaults to SDBM_File.
  64
  65 =head1 USING IT YOURSELF
  66
  67 bp_fetch is a wrapper around the bioperl modules which support
  68 the Bio::DB::RandomAccessI abstract interface. These include:
  69
  70   Author          Code
  71
  72   James Gilbert - Fasta indexer, Abstract indexer
  73   Aaron Mackay  - GenBank and GenPept DB access
  74   Ewan Birney   - EMBL .dat indexer
  75   Many people   - SeqIO code
  76
  77 These modules can be used directly, which is far better than using
  78 this script as a system call or a pipe to read from. Read the
  79 source code for bp_fetch to see how it is used.
  80
  81 =head1 EXTENDING IT
  82
  83 bp_fetch uses a number of different modules to provide access to
  84 databases. Any module which subscribes to the Bio::DB::RandomAccessI
  85 interface can be used here. For flat file indexers, this is
  86 best done by extending Bio::Index::Abstract, as is done in
  87 Bio::Index::EMBL and Bio::Index::Fasta. For access to other
  88 databases you will need to roll your own interface.
  89
  90 For new output formats, you need to add a new SeqIO module. The
  91 easiest thing is to look at Bio::SeqIO::fasta and figure out
  92 how to hack it for your own format (call it something different,
  93 obviously).
  94
  95 =head1 FEEDBACK
  96
  97 =head2 Mailing Lists
  98
  99 User feedback is an integral part of the evolution of this and other
 100 Bioperl modules. Send your comments and suggestions preferably to
 101 the Bioperl mailing list.  Your participation is much appreciated.
 102
 103   bioperl-l@bioperl.org                  - General discussion
 104   http://bioperl.org/wiki/Mailing_lists  - About the mailing lists
 105
 106 =head2 Reporting Bugs
 107
 108 Report bugs to the Bioperl bug tracking system to help us keep track
 109 of the bugs and their resolution. Bug reports can be submitted via the
 110 web:
 111
 112   https://github.com/bioperl/bioperl-live/issues
 113
 114 =head1 AUTHOR
 115
 116 Ewan Birney E<lt>birney@ebi.ac.ukE<gt>
 117
 118 =cut
 119
 120 use strict;
 121 use warnings;
 122 use Getopt::Long;
 123
 124 use Bio::Index::EMBL;
 125 use Bio::Index::Fasta;
 126 use Bio::Index::GenBank;
 127 use Bio::Index::SwissPfam;
 128 use Bio::Index::Swissprot;
 129 use Bio::SeqIO;
 130
 #
 # Start processing the command line
 #
 # -dir and -type start from the BIOPERL_INDEX and BIOPERL_INDEX_TYPE
 # environment variables, the output format starts as Fasta, and ids are
 # looked up as ids rather than accessions unless -acc is given.
 #

 my $dir    = $ENV{'BIOPERL_INDEX'};
 my $type   = $ENV{'BIOPERL_INDEX_TYPE'};
 my $fmt    = 'Fasta';
 my $useacc = 0;

 my $ret = GetOptions(
   'd|dir=s'  => \$dir,
   'f|fmt=s'  => \$fmt,
   't|type=s' => \$type,
   'acc!'     => \$useacc,
 );

 # GetOptions returns false after warning about each option it could not
 # parse; stop here instead of fetching with half-understood settings.
 if ( !$ret ) {
   die "Invalid command line options; run $0 with no arguments for help\n";
 }

 #
 # print pod documentation if we have no arguments
 #
 # exec only returns when perldoc could not be started, so report that
 # rather than falling through silently with nothing to fetch.
 #

 if ( !@ARGV ) {
   exec( 'perldoc', $0 )
     or die "Unable to run perldoc on $0: $!\n";
 }

 # Scratch variables used by the main loop further down. $out is left
 # undefined for the swisspfam/pfam formats (see the SeqIO set-up below).
 my ( $db, $dbobj, $id, $seq, $out, $meta );

 #
 # Reset the type if needed: hand the requested DBM type to
 # Bio::Index::Abstract through its package variable.
 #

 if ($type) {
   $Bio::Index::Abstract::USE_DBM_TYPE = $type;
 }

 #
 # Build at run time the SeqIO output
 #
 # No SeqIO writer is created when the format matches swisspfam/pfam.
 #
 if ( $fmt !~ /swisspfam|pfam/ ) {
   $out = Bio::SeqIO->new( -fh => \*STDOUT, -format => $fmt );
 }
 166
 #
 # Main loop over remaining arguments
 #
 # Each argument is [meta::]db:name. meta defaults to 'local'; 'net' and
 # 'ace' are the other accepted values. A problem with one argument is
 # reported with warn and the loop moves on to the next argument.
 #

 ARG:
 for my $arg (@ARGV) {

   # strip out meta:: if there
   my $spec = $arg;
   my $meta = ( $spec =~ s/^(\w+)::// ) ? $1 : 'local';

   # parse to db:id
   if ( $spec !~ /^(\S+)\:(\S+)$/ ) {
     warn "$spec is not parsed as db:name\n";
     next ARG;
   }
   my ( $db, $id ) = split /:/, $spec, 2;

   #
   # Open the database. The eval block catches exceptions raised while
   # loading or constructing it; the trailing 1 makes success explicit so
   # we do not rely on $@ alone.
   #
   my $dbobj;
   my $opened = eval {
     if ( $meta eq 'net' ) {
       if ( $db =~ /genbank/i ) {
         require Bio::DB::GenBank;
         $dbobj = Bio::DB::GenBank->new( -format => $fmt );
       }
       elsif ( $db =~ /genpept/i ) {
         require Bio::DB::GenPept;
         $dbobj = Bio::DB::GenPept->new();
       }
       elsif ( $db =~ /embl/i ) {
         require Bio::DB::EMBL;
         $dbobj = Bio::DB::EMBL->new();
       }
       else {
         die "Net database $db not available";
       }
     }
     elsif ( $meta eq 'ace' ) {

       # yank in Bio::DB::Ace at runtime
       eval { require Bio::DB::Ace; };
       if ($@) {
         die "Unable to load Bio::DB::Ace for ace::$db\n\n$@\n";
       }

       # db is server,port
       $db =~ /(\S+)\,(\d+)/
         || die "$db is not server.name,port for acedb database";
       my ( $server, $port ) = ( $1, $2 );

       $dbobj = Bio::DB::Ace->new( -host => $server, -port => $port );
     }
     elsif ( $meta eq 'local' ) {
       if ( !$dir ) {
         die "\nNo directory specified for index\nDirectory must be specified by the environment variable BIOPERL_INDEX or --dir option\ngo bp_index with no arguments for more help\n\n";
       }

       # The index is opened through the abstract class using the file
       # $dir/$db.
       $dbobj = Bio::Index::Abstract->new("$dir/$db");
     }
     else {
       die "Meta database $meta is not valid";
     }
     1;
   };

   # Also covers a constructor that returned nothing without dying, which
   # would otherwise crash on the ->isa calls below.
   if ( !$opened || !defined $dbobj ) {
     my $err = $@ || "no database object was created\n";
     warn("Database $db in $arg is not loadable. Skipping\n\nError $err");
     next ARG;
   }

   #
   # SwissPfam indexes: fetch() is read here as a filehandle, and the
   # entry is printed line by line up to the first whitespace-only line.
   #
   if ( $dbobj->isa("Bio::Index::SwissPfam") ) {
     my $fh = $dbobj->fetch($id);
     if ($fh) {
       while ( my $line = <$fh> ) {
         last if $line =~ /^\s+$/;
         print $line;
       }
     }
     else {
       warn("Cannot find $id\n");
     }
     next ARG;
   }

   #
   # Every other database is expected to provide get_Seq_by_id and
   # get_Seq_by_acc; warn (but still try) if it is not a RandomAccessI.
   #
   if ( !$dbobj->isa('Bio::DB::RandomAccessI') ) {
     warn("$db in $arg does not inherit from Bio::DB::RandomAccessI, so is not expected to work under the DB guidlines. Going to try it anyway");
   }

   # $out is only built when the format is not swisspfam/pfam. Without
   # this guard the write_seq call below dies on an undefined value.
   if ( !defined $out ) {
     warn("No SeqIO writer was created for format '$fmt', so sequence $id from $db cannot be written\n");
     next ARG;
   }

   my $seq;
   my $fetched = eval {
     $seq = $useacc
       ? $dbobj->get_Seq_by_acc($id)
       : $dbobj->get_Seq_by_id($id);
     1;
   };
   if ( !$fetched ) {
     warn("Sequence $id in Database $db in $arg is not loadable. Skipping.\n\nError $@");
     next ARG;
   }
   if ( !defined $seq ) {
     warn("Sequence $id in Database $db is not present\n");
     next ARG;
   }

   $out->write_seq($seq);
 }