7 bp_split_seq - splits a sequence into equal sized chunks with an optional
12 bp_split_seq -c 10000 [-o 1000] [-i] -f seq.in
16 The script will split sequences into chunks
20 -c Desired length of the resulting sequences.
21 -f Input file (must be FASTA format).
25 -o Overlapping range between the resulting sequences.
26 -i Create an index file with the resulting sequence files. This is
27 useful if you want to pass this list as input arguments into
28 another programs (i.e. CLUSTAL, HMMER, etc.).
34 User feedback is an integral part of the evolution of this and other
35 Bioperl modules. Send your comments and suggestions preferably to
36 the Bioperl mailing list. Your participation is much appreciated.
38 bioperl-l@bioperl.org - General discussion
39 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
43 Report bugs to the Bioperl bug tracking system to help us keep track
44 of the bugs and their resolution. Bug reports can be submitted via the
47 https://github.com/bioperl/bioperl-live/issues
51 Ewan Birney E<lt>birney-at-ebi.ac.ukE<gt>
52 Mauricio Herrera Cuadra E<lt>mauricio at intelligenomes.comE<gt>
57 # Modules, pragmas and variables to use
61 use vars
qw($opt_c $opt_o $opt_i $opt_f $index_file);
63 # Gets options from the command line
64 GetOptions qw(-c=i -o:i -i -f=s);
66 # If no mandatory options are given prints an error and exits
68 print "ERROR: No chunk size has been specified.\n" and exit();
70 print "ERROR: No FASTA file has been specified.\n" and exit();
73 # Declares offset size
74 my $offset = $opt_o ?
$opt_o : "0";
76 # Opens the FASTA file
77 my $in = Bio
::SeqIO
->new(
81 print "==> Opening FASTA file:\t\t\t\t$opt_f\n";
83 # Reads the next sequence object
84 while (my $seq = $in->next_seq()) {
86 # Reads the ID for the sequence and prints it
88 print "--> The ID for this sequence is:\t\t$id\n";
90 # Reads the description for the sequence and prints it
91 my $desc = $seq->desc();
92 print "--> The description for this sequence is:\t$desc\n";
94 # Gets sequence length and prints it
95 my $seq_length = $seq->length();
96 print "--> The length of this sequence is:\t\t$seq_length\n";
98 # If the chunk size is bigger than the sequence length prints the error and exits
99 (print "ERROR: Specified chunk size is bigger than sequence length.\n" and exit()) if ($opt_c > $seq_length);
101 # Creates a directory for writing the resulting files
102 mkdir("split", 0755) unless -e
"split" and -d
"split";
104 # Creates the INDEX file if the option was given
107 $index_file = "$id.c$opt_c.o$offset.INDEX";
108 open $FH, '>', $index_file or die "Could not write file '$index_file': $!\n";
111 # Loops through the sequence
112 for (my $i = 1; $i < $seq_length; $i += $opt_c) {
113 my $end = (($i + $opt_c) > $seq_length) ?
($seq_length + 1) : ($i + $opt_c);
114 my $seq_range = (($i + $opt_c) > $seq_length) ?
"$i-".($end - 1) : "$i-$end";
116 $id .= "_$seq_range";
118 # Stores chunk into its corresponding FASTA file
119 my $out = Bio
::SeqIO
->new(
120 -file
=> ">split/$id.faa",
123 my $trunc_seq = $seq->trunc($i, $end - 1);
125 $out->write_seq($trunc_seq);
126 print "==> Sequence chunk:\t$seq_range\tstored in file:\tsplit/$id.faa\n";
128 # Prints the current file name into the INDEX file if the option was given
129 print $FH "split/$id.faa\n" if $opt_i;
131 # Decreases the $i value with the offset value
135 # Closes the INDEX file if the option was given
137 print "==> INDEX stored in file:\t\t\t$index_file\n";