10 GetOptions
(\
%opts, "faFile:s", "len:i", "insert:i", "outDir:s", "oriChrName:s", "chrname:s", "project:s");
12 unless (defined $opts{faFile
} && $opts{outDir
} && $opts{project
}) {
13 print "\n\tthis program will generate a new fa file based on the original fa file. The scaffolds will be merged into several chromosomes using \"n\" to sperate them\n\tthe output will be the new fa file and a list with which the scaffold will be traced back\n";
14 print "\n\t-faFile\t\tthe original fa file\n";
15 print "\t-outDir\t\tthe output directory\n";
16 print "\t-project\tproject name\n";
17 print "\t-len\t\tthe len of each new chromosome after merge [10000000]\n";
18 print "\t-insert\t\tthe number of \"n\" insert into the new chromosome between two scaffolds [75]\n";
19 print "\t-oriChrName\tthe original name pattern (case insensitive) [Chr]\n";
20 print "\t-chrname\tthe new chromosome name pattern [ChrNew]\n";
21 print "Example:\n\tperl $0 -faFile /share/tmp/pub/Genome/cucumberBGI/Cucumber.original.fa -outDir /share/raid11/zhanghao/software/preProcess/ -project Cucumber -chrname LG_M\n";
22 print "\toutput: /share/raid11/zhanghao/software/preProcess/Cucumber.merge.fa\n\t\t/share/raid11/zhanghao/software/preProcess/Cucumber.merge.list\n";
23 print "\tperl $0 -faFile /share/tmp/pub/Genome/SorghumBicolor/sbi1.fasta -outDir /share/raid11/zhanghao/software/preProcess/ -project SorghumBicolor -oriChrName chromosome -chrname chromosome_M\n";
24 print "\tAuther: Hao Zhang\tTime: 22:32 21/04/2009\n";
25 print " fixed bye lijun3 Mon Jan 4 17:50:33 CST 2010\n";
29 my $fa_file = $opts{faFile
};
30 my $chr_len = $opts{len
};
31 my $insert_size = $opts{insert
};
32 my $out_dir = $opts{outDir
};
33 my $scaffold_tem = $opts{oriChrName
};
34 my $chr_tem = $opts{chrname
};
35 my $project_name = $opts{project
};
37 $chr_len ||= 10000000;
38 my $offset = $chr_len/1000;
40 $scaffold_tem ||= "Chr";
41 $chr_tem ||= "ChrNew";
42 my $base_per_line = 50;
43 #print"$scaffold_tem\t$chr_tem\n";
46 my $this_chr_index = 1;
49 my $current_scaf = "";
51 my %hScaName = (); #{chr} => (sca1, sca2, sca3...)
53 my $starttime = time();
55 open FA
, $fa_file or die "$!";
56 #my $file_basename = basename $fa_file;
57 #$file_basename =~ s/\.fa//;
58 open MERGEFA
, ">$out_dir/$project_name.merge.fa" or die "$!";
59 my $base_per_line_mark = 1;
60 while (my $line = <FA
>) {
63 if ($line !~ /$scaffold_tem/i) {
64 my $scaffold_name = (split /\s+/, $line)[0];
65 $scaffold_name =~ s/^>//;
66 if ($this_chr_index > ($chr_len - $offset)) { #finish merging one chromosome, and will start merge a new one.
67 $hChrInfo{$chr_num}{$current_scaf} .= "\t$this_chr_index";
72 $current_chr = "${chr_tem}0$chr_num";
76 $current_chr = "$chr_tem$chr_num";
79 $current_scaf = $scaffold_name;
81 $hChrInfo{$chr_num}{$current_scaf} = $this_chr_index;
82 push @
{$hScaName{$chr_num}}, $current_scaf;
83 print MERGEFA
"\n\>$current_chr\n";
85 else { #still the previous one
86 if ((1 == $this_chr_index) and (1 == $chr_num)) { #the first scaffold
87 $current_chr = "${chr_tem}0$chr_num";
88 $current_scaf = $scaffold_name;
89 $hChrInfo{$chr_num}{$current_scaf} = $this_chr_index;
90 push @
{$hScaName{$chr_num}}, $current_scaf;
91 print MERGEFA
">$current_chr\n";
94 my $end_index = $this_chr_index - 1;
95 $hChrInfo{$chr_num}{$current_scaf} .= "\t$end_index";
96 for(my $count = $this_chr_index; ($this_chr_index - $count) < $insert_size; ++$this_chr_index) {
98 if (0 == ($this_chr_index % $base_per_line)) {
102 $current_scaf = $scaffold_name;
103 $hChrInfo{$chr_num}{$current_scaf} = $this_chr_index;
104 push @
{$hScaName{$chr_num}}, $current_scaf;
111 print MERGEFA
"$line\n";
116 print MERGEFA
"$line\n";
117 if ($base_per_line_mark) {
118 my @count_base_per_line = unpack "(a)*", $line;
119 $base_per_line = @count_base_per_line;
120 $base_per_line_mark = 0;
124 my @bases = unpack "(a)*", $line;
125 foreach my $base (@bases) {
126 print MERGEFA
"$base";
127 if(0 == ($this_chr_index % $base_per_line)) {
135 my $end_index = $this_chr_index - 1;
136 $hChrInfo{$chr_num}{$current_scaf} .= "\t$end_index";
140 open LIST
, ">$out_dir/$project_name.merge.list" or die "$!";
141 open CHRORDER
, ">$out_dir/$project_name.merge.chrorder" or die "$!";
142 foreach my $chr (sort {$a<=>$b} keys %hChrInfo) {
143 print CHRORDER
"$chr_tem$chr\n";
144 foreach my $scaf (@
{$hScaName{$chr}}) {
145 print LIST
"$chr_tem$chr\t$scaf\t$hChrInfo{$chr}{$scaf}\n";
151 my $endtime = time();
152 my $processtime = $endtime - $starttime;
153 print "take $processtime seconds to process $fa_file\n";