modified: pixi.toml
[GalaxyCodeBases.git] / released / pIRS.old / pIRS_simulator / extra / split_scaffold.pl
blobf2f93485b378a74e8588d8cc15067c47ea7168bd
1 #!/usr/bin/perl
2 use strict;
3 use Getopt::Long;
4 my %opts;
6 my $usage=<<USAGE;
8 Function: (1) split scaffold sequences into contig sequnences;
9 (2) get contig coordinates and length;
10 (3) get gap coordinates and length
12 Contact: Fan wei,<fanw\@genomics.org.cn>
14 Version: 1.0, release 2010-1-7
16 Usage: perl split_scaffold.pl [options] <input_scaffold_file.fa>
17 -N <num> number of consecutive Ns for split site,default N=1
18 -out <str> three alternative types: contig_seq, contig_coor, gap_coor, default=contig_seq
19 -cutoff <num> set the size cutoff for contig, only contig > cutoff will be output
20 -detail output middle information to screen
21 -help output help information to screen
23 Example: perl split_scaffold.pl scaffold.fa > contig.fa
24 Note that the -cutoff option can only be co-used with contig_seq, contig_coor.
26 USAGE
28 GetOptions(\%opts,"N:i", "out:s", "cutoff:i", "detail!","help!");
29 die $usage if ( @ARGV==0 || defined($opts{"help"}));
31 ####################################################
32 ################# Main Function ###################
33 ####################################################
34 my $file=shift;
35 my $cut_N= (exists $opts{N}) ? $opts{N} : 1;
36 my $cutoff =(exists $opts{cutoff}) ? $opts{cutoff} : 0;
37 my $out_type = (exists $opts{out}) ? $opts{out} : "contig_seq";
39 open(IN, $file) || die ("can not open $file\n");
40 $/=">"; <IN>; $/="\n";
41 while (<IN>) {
42 my $chr=$1 if(/^(\S+)/);
43 $/=">";
44 my $seq=<IN>;
45 chomp $seq;
46 $seq=~s/\s//g;
47 $seq = uc $seq;
48 $/="\n";
50 print STDERR "\nSplit $chr\n" if (exists $opts{detail});
52 my @frag;
53 my ($start,$end,$frag,$pos);
55 #Remove Ns at heads
56 while ($seq=~s/^(N+)//g) {
57 $pos+=length($1);
60 ##cut scaffold to contigs
61 while ($seq=~s/^([^N]+)(N*)//) {
62 my $len_A=length($1);
63 my $len_N=length($2);
64 $pos += $len_A+$len_N;
65 if ($len_N >= $cut_N || !$seq) {
66 $frag .= $1;
67 $end=$pos-$len_N;
68 $start = $end - $len_A + 1;
69 my $gap_end = $pos;
70 my $gap_start = $pos - $len_N + 1;
71 push @frag,[$frag, $start,$end, $len_A, $gap_start, $gap_end, $len_N];
72 $frag="";
73 }else{
74 $frag .= $1.$2;
79 ##output the result in various format
80 my $frag_num = @frag;
81 for (my $i=0; $i<$frag_num; $i++) {
82 Display_seq(\$frag[$i][0]);
83 my $mark = $i+1;
84 my $contig_id = "$chr\_$mark";
86 print ">$contig_id $chr $frag[$i][1] $frag[$i][2] $frag[$i][3]\n$frag[$i][0]" if ($out_type eq "contig_seq" && $frag[$i][3]>=$cutoff);
87 print "$chr $frag[$i][1] $frag[$i][2] $frag[$i][3]\n" if ($out_type eq "contig_coor" && $frag[$i][3]>=$cutoff);
88 print "$chr $frag[$i][4] $frag[$i][5] $frag[$i][6]\n" if ($out_type eq "gap_coor" && $frag[$i][6]>0);
93 close(IN);
98 #display a sequence in specified number on each line
99 #usage: disp_seq(\$string,$num_line);
100 # disp_seq(\$string);
101 #############################################
102 sub Display_seq{
103 my $seq_p=shift;
104 my $num_line=(@_) ? shift : 50; ##set the number of charcters in each line
105 my $disp;
107 $$seq_p =~ s/\s//g;
108 for (my $i=0; $i<length($$seq_p); $i+=$num_line) {
109 $disp .= substr($$seq_p,$i,$num_line)."\n";
111 $$seq_p = ($disp) ? $disp : "\n";
113 #############################################