doc: update HACKING instructions with dzil instructions
[bioperl-live.git] / bin / bp_dbsplit
blobfa2f04adf1b3593f5d3964db3418ff80a436768f
1 #!/usr/bin/perl
2 #-*-Perl-*-
4 =head1 NAME
6 bp_dbsplit - script to split an input set of database(s) into smaller pieces
8 =head1 SYNOPSIS
10 bp_dbsplit --size 50 [-i inputfile] [-if inputformat] [-of outputformat]
11 [--prefix outputprefix] [ < file1 OR file1 file2 ... ]
13 =head1 DESCRIPTION
15 This script takes a sequence database as input - given as a list of
16 filenames, as a single file, or on STDIN - and splits it into separate
17 files of X sequences each. You specify X with the C<--size/-s>
18 parameter (default 100). The input and output sequence format is any that
19 is supported by bioperl (fasta, embl, genbank, gcg, swissprot, etc); both default to fasta.
21 You can specify the input data either as a single file with -i
22 filename, or as one or more files given as arguments like
24 % bp_dbsplit file1 file2
26 or as sequence data piped in on STDIN with
28 % cat file1 file2 file3 | bp_dbsplit
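30 Use the C<--prefix> option to set the output filename prefix; the pieces
31 are written to prefix.1, prefix.2, and so on. Without it, the prefix defaults to the input filename, or to 'db' when reading from STDIN.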
33 =head1 FEEDBACK
35 =head2 Mailing Lists
37 User feedback is an integral part of the evolution of this and other
38 Bioperl modules. Send your comments and suggestions preferably to
39 the Bioperl mailing list. Your participation is much appreciated.
41 bioperl-l@bioperl.org - General discussion
42 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
44 =head2 Reporting Bugs
46 Report bugs to the Bioperl bug tracking system to help us keep track
47 of the bugs and their resolution. Bug reports can be submitted via
48 the web:
50 https://github.com/bioperl/bioperl-live/issues
52 =head1 AUTHOR
54 Jason Stajich, jason-at-bioperl-dot-org
56 =cut
use strict;
use warnings;
use Bio::SeqIO;
use Bio::SeqIO::MultiFile;

use Getopt::Long;

# Defaults: 100 sequences per output file, fasta for both input and output.
my $dbsize = 100;
my $prefix;
my ($informat, $outformat, $infile) = ('fasta', 'fasta');

# Stop on unknown or malformed options instead of silently running with
# whatever defaults happen to be left in place.
GetOptions(
    's|size:s'   => \$dbsize,
    'if:s'       => \$informat,
    'of:s'       => \$outformat,
    'i:s'        => \$infile,
    'p|prefix:s' => \$prefix,
) or die "Usage: bp_dbsplit --size N [-i inputfile] [-if inputformat] "
       . "[-of outputformat] [--prefix outputprefix] [file ...]\n";

# A size of zero, a negative number, or a non-number would leave an empty
# first file and then put every sequence in a file of its own.
defined $dbsize && $dbsize =~ /\A[1-9][0-9]*\z/
    or die "bp_dbsplit: --size must be a positive integer\n";

# A lone positional argument is treated as the input file (it overrides -i).
if (@ARGV == 1) {
    $infile = shift @ARGV;
}

# Default output prefix: the input filename, else the first of several
# input files, else 'db' (reading from STDIN).
$prefix ||= $infile || $ARGV[0] || 'db';

# Pick the input stream: several files, one file, or neither.
my $in;
if (@ARGV) {
    $in = Bio::SeqIO::MultiFile->new(
        -files  => [@ARGV],
        -format => $informat || 'fasta',
    );
}
elsif ($infile) {
    $in = Bio::SeqIO->new(
        -file   => $infile,
        -format => $informat,
    );
}
else {
    # No -file/-fh given. NOTE(review): Bio::SeqIO is documented to fall
    # back to <> semantics here, i.e. STDIN since @ARGV is empty - confirm
    # against the installed BioPerl version.
    $in = Bio::SeqIO->new(-format => $informat);
}

# Write sequences to "$prefix.1", "$prefix.2", ... with at most $dbsize
# sequences per file. The first file is opened up front, so it exists
# (empty) even when the input holds no sequences.
my $count  = 1;
my $out    = open_chunk($count);
my $scount = 0;
while (my $seq = $in->next_seq) {
    if (++$scount > $dbsize) {
        # Current piece is full: roll over to the next numbered file.
        $out->close();
        $out    = open_chunk(++$count);
        $scount = 1;
    }
    $out->write_seq($seq);
}
# Close the last piece explicitly rather than relying on global destruction.
$out->close();

# open_chunk($n): return a Bio::SeqIO writer for piece number $n, i.e. the
# file "$prefix.$n" in the requested output format.
sub open_chunk {
    my ($n) = @_;
    return Bio::SeqIO->new(
        -format => $outformat,
        -file   => ">$prefix.$n",
    );
}

__END__