8 Function: (1) split scaffold sequences into contig sequnences;
9 (2) get contig coordinates and length;
10 (3) get gap coordinates and length
12 Contact: Fan wei,<fanw\@genomics.org.cn>
14 Version: 1.0, release 2010-1-7
16 Usage: perl split_scaffold.pl [options] <input_scaffold_file.fa>
17 -N <num> number of consecutive Ns for split site,default N=1
18 -out <str> three alternative types: contig_seq, contig_coor, gap_coor, default=contig_seq
19 -cutoff <num> set the size cutoff for contig, only contig > cutoff will be output
20 -detail output middle information to screen
21 -help output help information to screen
23 Example: perl split_scaffold.pl scaffold.fa > contig.fa
24 Note that the -cutoff option can only be co-used with contig_seq, contig_coor.
28 GetOptions
(\
%opts,"N:i", "out:s", "cutoff:i", "detail!","help!");
29 die $usage if ( @ARGV==0 || defined($opts{"help"}));
31 ####################################################
32 ################# Main Function ###################
33 ####################################################
35 my $cut_N= (exists $opts{N
}) ?
$opts{N
} : 1;
36 my $cutoff =(exists $opts{cutoff
}) ?
$opts{cutoff
} : 0;
37 my $out_type = (exists $opts{out
}) ?
$opts{out
} : "contig_seq";
39 open(IN
, $file) || die ("can not open $file\n");
40 $/=">"; <IN>; $/="\n";
42 my $chr=$1 if(/^(\S+)/);
50 print STDERR
"\nSplit $chr\n" if (exists $opts{detail
});
53 my ($start,$end,$frag,$pos);
56 while ($seq=~s/^(N+)//g) {
60 ##cut scaffold to contigs
61 while ($seq=~s/^([^N]+)(N*)//) {
64 $pos += $len_A+$len_N;
65 if ($len_N >= $cut_N || !$seq) {
68 $start = $end - $len_A + 1;
70 my $gap_start = $pos - $len_N + 1;
71 push @frag,[$frag, $start,$end, $len_A, $gap_start, $gap_end, $len_N];
79 ##output the result in various format
81 for (my $i=0; $i<$frag_num; $i++) {
82 Display_seq
(\
$frag[$i][0]);
84 my $contig_id = "$chr\_$mark";
86 print ">$contig_id $chr $frag[$i][1] $frag[$i][2] $frag[$i][3]\n$frag[$i][0]" if ($out_type eq "contig_seq" && $frag[$i][3]>=$cutoff);
87 print "$chr $frag[$i][1] $frag[$i][2] $frag[$i][3]\n" if ($out_type eq "contig_coor" && $frag[$i][3]>=$cutoff);
88 print "$chr $frag[$i][4] $frag[$i][5] $frag[$i][6]\n" if ($out_type eq "gap_coor" && $frag[$i][6]>0);
98 #display a sequence in specified number on each line
99 #usage: disp_seq(\$string,$num_line);
100 # disp_seq(\$string);
101 #############################################
104 my $num_line=(@_) ?
shift : 50; ##set the number of charcters in each line
108 for (my $i=0; $i<length($$seq_p); $i+=$num_line) {
109 $disp .= substr($$seq_p,$i,$num_line)."\n";
111 $$seq_p = ($disp) ?
$disp : "\n";
113 #############################################